caffeinatedcherrychic committed on
Commit db328d1 · 1 Parent(s): d38a8cf

Upload folder using huggingface_hub

Files changed (8)
  1. README.md +3 -9
  2. app.py +61 -0
  3. backend.py +66 -0
  4. flask.py.save +18 -0
  5. flask.py.save.1 +18 -0
  6. oldbacked.py +83 -0
  7. requirements.txt +1 -0
  8. server.py +8 -0
README.md CHANGED
@@ -1,12 +1,6 @@
 ---
-title: Gradio Server
-emoji: 👁
-colorFrom: blue
-colorTo: indigo
+title: gradio-server
+app_file: backend.py
 sdk: gradio
-sdk_version: 4.25.0
-app_file: app.py
-pinned: false
+sdk_version: 3.50.2
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,61 @@
+import gradio as gr
+import time
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from langchain.memory import ConversationBufferWindowMemory
+from peft import PeftModel
+import torch
+import re
+
+
+print("Initializing model")
+# Initialize the tokenizer and model
+base_model = "mistralai/Mistral-7B-Instruct-v0.2"
+tokenizer = AutoTokenizer.from_pretrained(base_model)
+tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+base_model = AutoModelForCausalLM.from_pretrained(base_model)
+
+ft_model = PeftModel.from_pretrained(base_model, "nuratamton/story_sculptor_mistral")
+# ft_model = ft_model.merge_and_unload()
+ft_model.eval()
+
+
+# Set the device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ft_model.to(device)
+
+
+memory = ConversationBufferWindowMemory(k=10)
+
+def slow_echo(message, history):
+    message = chat_interface(message)
+    for i in range(len(message)):
+        time.sleep(0.05)
+        yield message[: i + 1]
+
+
+def chat_interface(user_in):
+    if user_in.lower() == "quit":
+        return "Goodbye!"
+    # memory.save_context({"input": user_in}, {"output": ""})
+    memory_context = memory.load_memory_variables({})["history"]
+    user_input = f"[INST] Continue the game and maintain context and keep the story consistent throughout: {memory_context}{user_in}[/INST]"
+
+    encodings = tokenizer(user_input, return_tensors="pt", padding=True).to(device)
+    input_ids = encodings["input_ids"]
+    attention_mask = encodings["attention_mask"]
+
+    output_ids = ft_model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=1000, num_return_sequences=1, do_sample=True, temperature=1.1, top_p=0.9, repetition_penalty=1.2)
+
+    generated_ids = output_ids[0, input_ids.shape[-1]:]
+
+    # Decode the output
+    response = tokenizer.decode(generated_ids, skip_special_tokens=True)
+    memory.save_context({"input": user_in}, {"output": response})
+    print(f"Game Agent: {response}")
+    # Your chatbot logic here
+    # response = "You said: " + user_in
+    return response
+
+
+iface = gr.ChatInterface(slow_echo).queue()
+iface.launch(share=True)
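
A note on the tokenizer setup in app.py: it registers a new "[PAD]" token but never resizes the model's embedding matrix. With one prompt per request the pad token is never actually consumed, but if batched inputs were ever padded, the new token id would fall outside the embedding table. A minimal sketch of the usual adjustment, not part of the committed file (resize_token_embeddings is standard transformers API; doing it before the PEFT adapter is attached is an assumption about this setup):

    # hypothetical fix: give the added "[PAD]" token a row in the embedding table
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    base_model.resize_token_embeddings(len(tokenizer))  # before PeftModel.from_pretrained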
backend.py ADDED
@@ -0,0 +1,66 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import PeftModel
+from langchain.memory import ConversationBufferWindowMemory
+import gradio as gr
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+)
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+base_model = "mistralai/Mistral-7B-Instruct-v0.2"
+tokenizer = AutoTokenizer.from_pretrained(base_model, pad_token="[PAD]")
+model = AutoModelForCausalLM.from_pretrained(
+    base_model,
+    quantization_config=bnb_config,
+    device_map="auto",
+    trust_remote_code=True,
+)
+ft_model = PeftModel.from_pretrained(model, "nuratamton/story_sculptor_mistral").eval()
+memory = ConversationBufferWindowMemory(k=10)
+
+def generate_text(message):
+    user_in = message
+
+    if user_in.lower() in ["adventure", "mystery", "horror", "sci-fi"]:
+        memory.clear()
+
+    if user_in.lower() == "quit":
+        raise ValueError("User requested to quit")
+
+    memory_context = memory.load_memory_variables({})["history"]
+    user_input = f"{memory_context}[INST] Continue the game and maintain context: {user_in}[/INST]"
+
+    encodings = tokenizer(user_input, return_tensors="pt", padding=True).to(device)
+    input_ids, attention_mask = encodings["input_ids"], encodings["attention_mask"]
+    output_ids = ft_model.generate(
+        input_ids,
+        attention_mask=attention_mask,
+        max_new_tokens=1000,
+        num_return_sequences=1,
+        do_sample=True,
+        temperature=1.1,
+        top_p=0.9,
+        repetition_penalty=1.2,
+    )
+
+    generated_ids = output_ids[0, input_ids.shape[-1]:]
+    response = tokenizer.decode(generated_ids, skip_special_tokens=True)
+    memory.save_context({"input": user_in}, {"output": response})
+
+    response = response.replace("AI: ", "")
+    return response
+
+iface = gr.Interface(
+    fn=generate_text,
+    inputs="text",
+    outputs="text",
+    title="Text Generation",
+    description="Enter a message to generate text.",
+)
+
+iface.launch(share=True)
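
Because the README now points app_file at backend.py, this gr.Interface is what the Space serves. A minimal client sketch, assuming the gradio_client package; the Space id below is inferred from the committer name and the new README title, and the /predict endpoint is the usual default for a single-function Interface (client.view_api() lists what is actually exposed):

    from gradio_client import Client

    client = Client("caffeinatedcherrychic/gradio-server")  # assumed Space id
    print(client.view_api())                                # confirm the endpoint name
    reply = client.predict("adventure", api_name="/predict")
    print(reply)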
flask.py.save ADDED
@@ -0,0 +1,18 @@
+from flask import Flask, request, jsonify
+import gradio as gr
+
+app = Flask(__name__)
+
+def my_gradio_function(input_text):
+    # Your processing logic here
+    return "Processed: " + input_text
+
+@app.route("/process", methods=["POST"])
+def process():
+    input_text = request.json["input_text"]
+    output_text = my_gradio_function(input_text)
+    return jsonify({"output_text": output_text})
+
+if __name__ == "__main__":
+    app.run(port=5000)
+
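
For reference, the /process route above expects a JSON body with an input_text field and answers with output_text. A quick local test, assuming the default Flask host and the port 5000 set in app.run:

    import requests

    resp = requests.post(
        "http://127.0.0.1:5000/process",
        json={"input_text": "hello"},
    )
    print(resp.json()["output_text"])  # -> "Processed: hello"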
flask.py.save.1 ADDED
@@ -0,0 +1,18 @@
+from flask import Flask, request, jsonify
+import gradio as gr
+
+app = Flask(__name__)
+
+def my_gradio_function(input_text):
+    # Your processing logic here
+    return "Processed: " + input_text
+
+@app.route("/process", methods=["POST"])
+def process():
+    input_text = request.json["input_text"]
+    output_text = my_gradio_function(input_text)
+    return jsonify({"output_text": output_text})
+
+if __name__ == "__main__":
+    app.run(port=5000)
+
oldbacked.py ADDED
@@ -0,0 +1,83 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import PeftModel
+from langchain.memory import ConversationBufferWindowMemory
+from fastapi.middleware.cors import CORSMiddleware
+
+app = FastAPI()
+
+# Add CORSMiddleware to the application
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+)
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+base_model = "mistralai/Mistral-7B-Instruct-v0.2"
+tokenizer = AutoTokenizer.from_pretrained(base_model, pad_token="[PAD]")
+model = AutoModelForCausalLM.from_pretrained(
+    base_model,
+    quantization_config=bnb_config,
+    device_map="auto",
+    trust_remote_code=True,
+)
+ft_model = PeftModel.from_pretrained(model, "nuratamton/story_sculptor_mistral").eval()
+memory = ConversationBufferWindowMemory(k=10)
+
+
+class UserRequest(BaseModel):
+    message: str
+
+
+@app.post("/generate/")
+async def generate_text(request: UserRequest):
+    user_in = request.message
+
+    if user_in.lower() in ["adventure", "mystery", "horror", "sci-fi"]:
+        memory.clear()
+
+    if user_in.lower() == "quit":
+        raise HTTPException(status_code=400, detail="User requested to quit")
+
+    memory_context = memory.load_memory_variables({})["history"]
+    user_input = f"{memory_context}[INST] Continue the game and maintain context: {user_in}[/INST]"
+
+    encodings = tokenizer(user_input, return_tensors="pt", padding=True).to(device)
+    input_ids, attention_mask = encodings["input_ids"], encodings["attention_mask"]
+    output_ids = ft_model.generate(
+        input_ids,
+        attention_mask=attention_mask,
+        max_new_tokens=1000,
+        num_return_sequences=1,
+        do_sample=True,
+        temperature=1.1,
+        top_p=0.9,
+        repetition_penalty=1.2,
+    )
+
+    generated_ids = output_ids[0, input_ids.shape[-1]:]
+    response = tokenizer.decode(generated_ids, skip_special_tokens=True)
+    memory.save_context({"input": user_in}, {"output": response})
+
+    response = response.replace("AI: ", "")
+    # response = response.replace("Human: ", "")
+
+    return {"response": response}
+
+
+@app.get("/")
+def read_root():
+    return {"message": "Hello from FastAPI"}
+
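
The retired FastAPI backend exposes POST /generate/, which takes a JSON body with a message field and returns {"response": ...}, or a 400 error when the message is "quit". A minimal client sketch, assuming the app is served with uvicorn on its default port 8000 (uvicorn oldbacked:app):

    import requests

    # assumes the server was started with: uvicorn oldbacked:app  (default port 8000)
    resp = requests.post(
        "http://127.0.0.1:8000/generate/",
        json={"message": "adventure"},
    )
    if resp.status_code == 400:   # "quit" is rejected by the endpoint
        print(resp.json()["detail"])
    else:
        print(resp.json()["response"])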
requirements.txt ADDED
@@ -0,0 +1 @@
+torch transformers peft langchain
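
Note that pip reads requirements.txt one requirement per line, so the four space-separated names on a single line will not install as four packages when the Space builds; the intended contents are presumably:

    torch
    transformers
    peft
    langchain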
server.py ADDED
@@ -0,0 +1,8 @@
+import gradio as gr
+
+def add_numbers(num1, num2):
+    return num1 + num2
+
+iface = gr.Interface(fn=add_numbers, inputs=["number", "number"], outputs="number")
+iface.launch(share=True)
+