#!/usr/bin/env python3
"""
RESON-LLAMA chat with CONVERSATIONAL MEMORY - MINIMAL CLEANUP
"""

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch
import warnings
import re

warnings.filterwarnings("ignore", category=UserWarning)

conversation_turns = []
MAX_MEMORY_TURNS = 4

def load_reson_model(model_path=r"C:\Users\dacan\OneDrive\Desktop\Meta\Reson4.5\Reson4.5"):
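    """Load the Llama-2 base model in 4-bit and attach the RESON LoRA adapter from model_path."""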
    print(f"๐Ÿง  Caricamento RESON-LLAMA da {model_path}...")
    
    base_model_name = "meta-llama/Llama-2-7b-chat-hf"
    
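    # 4-bit NF4 quantization with double quantization keeps the 7B base model within a single consumer GPU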
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )
    
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
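    # Llama-2's tokenizer ships without a pad token; reuse EOS so padding works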
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        use_cache=False,
        low_cpu_mem_usage=True
    )
    
    model = PeftModel.from_pretrained(base_model, model_path)
    
    print("โœ… RESON-LLAMA V4 caricato con memoria!")
    return model, tokenizer

def minimal_clean_response(response):
    """Pulizia MINIMALE - rimuove tutto tra parentesi quadre"""
    
    # Rimuovi QUALSIASI cosa tra parentesi quadre [...]
    cleaned = re.sub(r'\[.*?\]', '', response)
    
    # Clean up multiple spaces and excess blank lines
    cleaned = re.sub(r'[ \t]+', ' ', cleaned)
    cleaned = re.sub(r' *\n *', '\n', cleaned)
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    cleaned = cleaned.strip()
    
    return cleaned

def format_conversation_prompt(conversation_turns, current_question):
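    """Build a Llama-2 chat prompt: the last MAX_MEMORY_TURNS exchanges wrapped in [INST] ... [/INST] tags, followed by the current question."""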
    prompt_parts = []
    
    for turn in conversation_turns[-MAX_MEMORY_TURNS:]:
        prompt_parts.append(f"[INST] {turn['question']} [/INST] {turn['answer']}")
    
    prompt_parts.append(f"[INST] {current_question} [/INST]")
    
    full_prompt = " ".join(prompt_parts)
    return full_prompt

def generate_response(model, tokenizer, prompt):
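    """Tokenize the prompt, sample a completion, and return only the cleaned newly generated text."""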
    inputs = tokenizer(
        prompt, 
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
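    # Remember the prompt length so only the newly generated tokens are decoded below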
    input_length = inputs['input_ids'].shape[1]
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.60,
            do_sample=True,
            top_p=0.94,
            top_k=40,
            min_p=0.05,
            repetition_penalty=1.15,
            no_repeat_ngram_size=3,
            min_new_tokens=60,  # min_length counts the prompt too; force at least 60 *new* tokens instead
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True
        )
    
    new_tokens = outputs[0][input_length:]
    raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
    
    # Minimal cleanup - keep all of the interesting content
    clean_response = minimal_clean_response(raw_response)
    
    return clean_response

def chat_with_memory(model, tokenizer):
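    """Interactive chat loop that keeps a rolling window of the last MAX_MEMORY_TURNS exchanges."""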
    global conversation_turns
    conversation_turns = []
    
    print("\n๐Ÿง  RESON-LLAMA V4 CHAT CON MEMORIA")
    print("Comandi: 'quit' = esci, 'clear' = cancella memoria")
    
    while True:
        try:
            user_input = input("\n🧑 You: ").strip()
            
            if user_input.lower() == 'quit':
                print("๐Ÿ‘‹ Arrivederci!")
                break
                
            elif user_input.lower() == 'clear':
                conversation_turns = []
                print("๐Ÿง  Memoria cancellata!")
                continue
            
            if not user_input:
                continue
            
            print("๐Ÿง  RESON sta riflettendo...")
            
            prompt = format_conversation_prompt(conversation_turns, user_input)
            response = generate_response(model, tokenizer, prompt)
            
            print(f"\n๐Ÿค– RESON: {response}")
            
            conversation_turns.append({
                'question': user_input,
                'answer': response
            })
            
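            # Trim the history to the memory window so the prompt length stays bounded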
            if len(conversation_turns) > MAX_MEMORY_TURNS:
                conversation_turns = conversation_turns[-MAX_MEMORY_TURNS:]
            
        except KeyboardInterrupt:
            print("\n๐Ÿ‘‹ Chat interrotta!")
            break
        except Exception as e:
            print(f"โŒ Errore: {e}")

def main():
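    """Load the model and start the interactive chat."""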
    print("๐Ÿง  RESON-LLAMA V4 CON MEMORIA")
    
    model, tokenizer = load_reson_model()
    chat_with_memory(model, tokenizer)

if __name__ == "__main__":
    main()