made1570 committed
Commit da40d5c · verified · 1 Parent(s): 76ca090

Update app.py

Files changed (1): app.py (+57 -44)
app.py CHANGED
@@ -1,48 +1,61 @@
 
  import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
  import gradio as gr
-
- # Load model and tokenizer using Unsloth-style
- model_name = "adarsh3601/my_gemma3_pt"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- def chat(user_input, history):
-     messages = []
-     for user_msg, bot_msg in history:
-         messages.append({"role": "user", "content": user_msg})
-         messages.append({"role": "assistant", "content": bot_msg})
-     messages.append({"role": "user", "content": user_input})
-
-     # Apply chat template
-     prompt = tokenizer.apply_chat_template(
          messages,
-         add_generation_prompt=True,
-         tokenize=False
-     )
-
-     inputs = tokenizer(prompt, return_tensors="pt").to(device)
-
-     outputs = model.generate(
-         **inputs,
-         max_new_tokens=1024,
-         temperature=1.0,
-         top_p=0.95,
-         top_k=64,
-         do_sample=True,
-         pad_token_id=tokenizer.pad_token_id,
-         eos_token_id=tokenizer.eos_token_id
      )
-
-     # Decode and extract just the last assistant message
-     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     if "<start_of_turn>assistant" in decoded:
-         response = decoded.split("<start_of_turn>assistant")[-1].strip()
-     else:
-         response = decoded
-
-     return response
-
- gr.ChatInterface(fn=chat, title="Chat with Gemma-3").launch(share=True)
+ import os
  import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
  import gradio as gr
+ from unsloth import FastModel
+
+ # Set environment for Hugging Face Spaces
+ os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
+
+ # Load the model from Hugging Face Model Hub
+ model_repo_id = 'adarsh3601/my_gemma3_pt'
+
+ # Load model and tokenizer using FastModel
+ model, tokenizer = FastModel.from_pretrained(
+     model_name=model_repo_id,
+     max_seq_length=64000,
+     load_in_4bit=True,  # Load model with 4-bit quantization
+     load_in_8bit=False,
+     full_finetuning=False
+ )
+
+ # Function to generate text based on user input
+ def generate_text(user_input):
+     # Prepare the input as per the model's expected format
+     messages = [{
+         "role": "user",
+         "content": [{"type" : "text", "text" : user_input}]
+     }]
+
+     text = tokenizer.apply_chat_template(
          messages,
+         add_generation_prompt=True,  # Must add for generation
      )
+
+     # Generate output with model
+     with torch.no_grad():
+         output = model.generate(
+             **tokenizer([text], return_tensors="pt").to("cuda"),
+             max_new_tokens=64000,  # Adjust if you need more tokens
+             temperature=1.0,
+             top_p=0.95,
+             top_k=64,
+             streamer=None  # You can set a streamer if needed
+         )
+
+     # Decode the model output and return the result
+     decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
+     return decoded_output
+
+ # Build the Gradio interface
+ iface = gr.Interface(
+     fn=generate_text,
+     inputs=gr.Textbox(lines=2, placeholder="Enter your text here..."),
+     outputs=gr.Textbox(lines=2, placeholder="Generated text will appear here..."),
+     title="Gemma-3 Model",
+     description="This is a simple interface to interact with the Gemma-3 model. Enter a prompt and see the generated response."
+ )
+
+ # Launch the app
+ if __name__ == "__main__":
+     iface.launch()
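
A note on the new generation path, with a sketch that is not part of this commit: in transformers, tokenizer.apply_chat_template returns token IDs unless tokenize=False is passed (the previous version passed it), and decoding output[0] returns the prompt together with the completion, which the previous chat() stripped by splitting on the assistant turn. A minimal variant of generate_text that keeps both behaviours could look as follows; it assumes the model and tokenizer objects created by FastModel.from_pretrained above, reuses the sampling settings from the earlier version, and the prompt-length slicing is an illustration rather than something the commit does.

# Sketch only; `model` and `tokenizer` are assumed to be the objects
# created by FastModel.from_pretrained() earlier in app.py.
import torch

def generate_text(user_input):
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_input}]
    }]

    # tokenize=False makes apply_chat_template return a prompt string,
    # which can then be passed through tokenizer(...) as the app does.
    text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False
    )

    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=1024,  # value used by the previous version
            do_sample=True,
            temperature=1.0,
            top_p=0.95,
            top_k=64
        )

    # generate() returns prompt + completion; slice off the prompt tokens
    # so only the newly generated text is returned to Gradio.
    new_tokens = output[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)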