Spaces:
Build error
Build error
File size: 3,599 Bytes
257b0ba d8dbda0 257b0ba f610abe 257b0ba df1ddcb aa905fa df1ddcb 1b9d133 257b0ba 3ca82c1 257b0ba f610abe 6390543 257b0ba f610abe 257b0ba f610abe 257b0ba f610abe 257b0ba f610abe 257b0ba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import os
import time

import ctransformers
import gradio as gr
import pinecone
import transformers
from datasets import load_dataset
from langchain.chains import RetrievalQA
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Pinecone
from torch import bfloat16
# Embedding model: the same object embeds documents for indexing and
# provides the query-embedding function for the vector store.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
device = 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

# Embed two throwaway sentences; the length of the resulting vectors is
# what the index is dimensioned with when it has to be created.
docs = ['This is a document', 'and another document']
embeddings = embed_model.embed_documents(docs)
# Pinecone credentials are read from the environment; either value is None
# when the corresponding variable is unset.
api_key = os.environ.get('PINECONE_API_KEY')
env_name = os.environ.get('PINECONE_ENV')
pinecone.init(
    api_key=api_key,
    environment=env_name
)

index_name = 'llama-2-rag'
if index_name not in pinecone.list_indexes():
    # Create the index on first run, sized to the embedding model's output
    # (taken from the probe embeddings computed above).
    pinecone.create_index(
        index_name,
        dimension=len(embeddings[0]),
        metric='cosine'
    )

# Poll once per second until the index reports ready. This runs whether or
# not the index was just created, so an index that already exists but is
# still initializing is waited for as well.
while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pinecone.Index(index_name)
# Load the pre-chunked arXiv dataset and convert it to a DataFrame so it
# can be sliced into batches by position.
data = load_dataset('jamescalam/llama-2-arxiv-papers-chunked', split='train')
data = data.to_pandas()

# Embed and upsert the dataset in fixed-size batches.
batch_size = 32
for i in range(0, len(data), batch_size):
    i_end = min(len(data), i + batch_size)
    batch = data.iloc[i:i_end]
    # Walk the batch once and reuse the rows for ids, texts and metadata
    # instead of calling iterrows() three separate times.
    rows = [row for _, row in batch.iterrows()]
    # Vector id is "<doi>-<chunk-id>", so re-running the script overwrites
    # the same ids rather than adding new ones.
    ids = [f"{row['doi']}-{row['chunk-id']}" for row in rows]
    texts = [row['chunk'] for row in rows]
    embeds = embed_model.embed_documents(texts)
    # The chunk text is stored under 'text' so it can be read back from the
    # index metadata at query time.
    metadata = [
        {'text': row['chunk'],
         'source': row['source'],
         'title': row['title']}
        for row in rows
    ]
    # One (id, vector, metadata) tuple per chunk.
    index.upsert(vectors=list(zip(ids, embeds, metadata)))
# Model repository to load. Alternatives that were tried:
#   "TheBloke/Llama-2-7B-GGML"
#   "TheBloke/Llama-2-7B-chat-GGML"
#   "TheBloke/Llama-2-13B-GGML"
model_id = "TheBloke/Llama-2-13B-chat-GGML"

# Hugging Face access token from the environment (None when unset).
hf_auth = os.getenv('HF_AUTH_KEY')
# bnb_config = transformers.BitsAndBytesConfig(
# load_in_4bit=True,
# bnb_4bit_quant_type='nf4',
# bnb_4bit_use_double_quant=True,
# bnb_4bit_compute_dtype=bfloat16,
# )
# model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)
# model = transformers.AutoModelForCausalLM.from_pretrained(
# model_id,
# trust_remote_code=True,
# config=model_config,
# quantization_config=bnb_config,
# device_map='auto',
# use_auth_token=hf_auth
# )
# model.eval()
# tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)
## Using GGML Llama
# Generation settings, forwarded as keyword arguments when loading the model.
config = dict(
    max_new_tokens=512,
    repetition_penalty=1.1,
    temperature=0.3,
    stream=True,
)

# Loader options merged with the generation settings above.
_load_options = {
    'model_type': 'llama',
    'gpu_layers': 130,  # 110 for 7b, 130 for 13b
    'hf': True,
    **config,
}
model = ctransformers.AutoModelForCausalLM.from_pretrained(model_id, **_load_options)

# The tokenizer is built from the loaded model object.
tokenizer = ctransformers.AutoTokenizer.from_pretrained(model)
# Generation settings for the pipeline; same values as in `config`.
_generation_options = {
    'temperature': 0.3,
    'max_new_tokens': 512,
    'repetition_penalty': 1.1,
}

# Text-generation pipeline over the loaded model and tokenizer.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    **_generation_options,
)

# Wrap the pipeline so it can be passed to LangChain as an LLM.
llm = HuggingFacePipeline(pipeline=generate_text)
# Metadata key under which each chunk's text was stored at upsert time.
text_field = 'text'

# Vector store over the Pinecone index; queries are embedded with the same
# model that embedded the documents.
vectorstore = Pinecone(index, embed_model.embed_query, text_field)

# Retrieval-augmented QA chain: the LLM plus a retriever over the store.
_retriever = vectorstore.as_retriever()
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=_retriever,
    chain_type='stuff',
)
title = 'arxiv-retrieval'


def predict(input):
    """Answer a question with the retrieval-augmented QA chain.

    Args:
        input: The question text passed to ``rag_pipeline``. The name
            shadows the ``input`` builtin; it is kept so existing keyword
            callers keep working.

    Returns:
        The value stored under ``'result'`` in the chain's output.
    """
    return rag_pipeline(input)['result']


def _chat(message, history):
    """Gradio callback: answer ``message`` and extend the chat history.

    The interface below declares two inputs ('text', 'state') and two
    outputs ('chatbot', 'state'), so the callback must take two arguments
    and return two values; ``predict`` alone does neither.

    Args:
        message: Text entered by the user.
        history: Previous exchanges held in the 'state' component, or
            ``None`` when no state has been stored yet.

    Returns:
        A pair ``(turns, turns)``: the first is shown in the chatbot
        output, the second is stored back into the state.
    """
    # Copy so the caller's list is never mutated in place.
    turns = list(history or [])
    # (user message, answer) pairs.
    # NOTE(review): newer Gradio releases prefer the "messages" chatbot
    # format; confirm against the Gradio version pinned for this Space.
    turns.append((message, predict(message)))
    return turns, turns


# Build the UI and start serving; `title` is now actually passed through.
gr.Interface(
    fn=_chat,
    inputs=['text', 'state'],
    outputs=['chatbot', 'state'],
    title=title,
).launch()