import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig
import torch

# Initialize model and tokenizer
MODEL_NAME = "TheBloke/deepseek-coder-1.3b-instruct-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Configure GPTQ for inference (the checkpoint is already quantized,
# so no calibration dataset is needed at load time)
quantization_config = GPTQConfig(
    bits=4,            # 4-bit quantization, matching the checkpoint
    model_seqlen=2048  # Match model's maximum context length
)

# Load model with CPU optimizations
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    torch_dtype=torch.float32,  # CPU-friendly precision
    low_cpu_mem_usage=True,
    offload_folder="offload",   # Disk offloading for large layers
    offload_state_dict=True     # Memory-efficient state loading
)
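
# Note (assumption, not verified here): GPTQ inference through transformers
# relies on the `optimum` and `auto-gptq` packages, and the ExLlama kernels
# only run on GPU. If loading fails on a CPU-only machine, one option is to
# disable them explicitly before the from_pretrained call above, e.g.:
#
#   quantization_config = GPTQConfig(bits=4, use_exllama=False, model_seqlen=2048)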

def generate_text(prompt, max_length=150, temperature=0.7):
    """Generate text with inference settings tuned for low-memory CPU use."""
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=int(max_length),      # Gradio sliders may pass floats
            temperature=float(temperature),
            pad_token_id=tokenizer.eos_token_id,
            num_beams=1,            # Single beam keeps memory use minimal
            do_sample=True,         # Enable sampling for varied output
            top_p=0.95,             # Nucleus sampling
            repetition_penalty=1.1  # Reduce repetition
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
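
# Quick local sanity check (optional; the prompt text is illustrative only):
# print(generate_text("Write a Python function that reverses a string", max_length=120))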

# Gradio Interface with Enhanced UX
with gr.Blocks(theme="soft", css=".gr-box {border-radius: 10px} .monospace {font-family: monospace}") as demo:
    gr.Markdown("""
    # 🧠 DeepSeek Coder 1.3B Text Generator
    *Optimized for CPU execution on HuggingFace Free Tier*
    """)
    
    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="Input Prompt",
                placeholder="Enter your programming/code-related question...",
                lines=5,
                max_lines=10,
                elem_classes=["monospace"]
            )
            with gr.Row():
                max_length = gr.Slider(50, 500, value=150, label="Max Length", step=10)
                temperature = gr.Slider(0.1, 1.0, value=0.7, label="Creativity", step=0.05)
            submit = gr.Button("πŸš€ Generate", variant="primary")
        
        output = gr.Textbox(
            label="Generated Output",
            lines=12,
            max_lines=20,
            elem_classes=["monospace"]
        )
    
    submit.click(
        fn=generate_text,
        inputs=[prompt, max_length, temperature],
        outputs=output
    )
    
    gr.Examples(
        examples=[
            # Each example row supplies a value for every input component
            ["Write a Python function to calculate Fibonacci numbers", 150, 0.7],
            ["Explain the difference between lists and tuples in Python", 150, 0.7],
            ["Create a simple Flask API endpoint for user registration", 150, 0.7]
        ],
        fn=generate_text,
        inputs=[prompt, max_length, temperature],
        outputs=output,
        cache_examples=False
    )
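
# Assumption: on a shared CPU instance, each generation can take a while.
# Gradio's built-in request queue can be enabled before launching so that
# concurrent requests wait in line instead of timing out, e.g.:
#
#   demo.queue().launch()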

if __name__ == "__main__":
    demo.launch()