import logging
import sys

# Log to stdout so progress from llama_index / llama-cpp is visible while the app runs
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import git
import torch

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt

# Load the documents to index from the current directory
documents = SimpleDirectoryReader("./").load_data()

llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf',
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # set the context window below the model's maximum to leave room for the prompt
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); -1 offloads all layers to the GPU
    # (requires a GPU-enabled llama-cpp-python build)
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the Llama 2 / Mistral instruct prompt format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding

# Embedding model used to vectorize both the documents and the queries
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="thenlper/gte-large")
)

service_context = ServiceContext.from_defaults(
    chunk_size=256,
    llm=llm,
    embed_model=embed_model,
)

index = VectorStoreIndex.from_documents(documents, service_context=service_context)

# Create a Streamlit app file (e.g., app.py) and run it
import streamlit as st
from transformers import GPT2LMHeadModel, GPT2Tokenizer


def generate_response(prompt):
    # (Optional) plain GPT-2 completion helper; not used by the RAG flow in main() below
    model_name = "gpt2"
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(
        input_ids,
        max_length=100,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text


def main():
    st.title("Cloudflare RAG")

    # User input
    user_input = st.text_input("Enter your message:")

    if user_input:
        # Generate response by querying the index built above
        query_engine = index.as_query_engine()
        response = query_engine.query(user_input)

        # Display response (convert the Response object to plain text)
        st.text_area("Response:", str(response), height=100)


if __name__ == "__main__":
    main()
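# A minimal way to try this out, assuming the listing above is saved as app.py and
# that the packages below are the ones your environment needs (the package list is
# an assumption, not part of the listing itself; the ServiceContext and
# llama_index.llms import paths used above correspond to pre-0.10 llama-index releases):
#
#   pip install streamlit llama-index llama-cpp-python langchain sentence-transformers transformers
#   streamlit run app.py
#
# The first run downloads the GGUF model from model_url, so expect it to take a while.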