jsakshi committed on
Commit
6494de8
·
verified ·
1 Parent(s): 9f8d6cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -14
app.py CHANGED
@@ -1,21 +1,96 @@
1
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the model and tokenizer
model_name = "describeai/gemini" # Replace with the actual Gemini model if available on HF
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()  # inference only — disables dropout/batchnorm training behavior

# Define input text
input_text = "Explain the Python function below:\n\ndef add(a, b):\n return a + b"

# Tokenize input
inputs = tokenizer(input_text, return_tensors="pt")

# Generate response.
# NOTE: max_new_tokens bounds only the generated continuation; the previous
# max_length=100 also counted the prompt tokens, so a long prompt could leave
# no room for generation at all.
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=100)

# Decode and print result
response = tokenizer.decode(output[0], skip_special_tokens=True)
print("Model Output:", response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard-library imports first; previously `time` and `psutil` were
# imported mid-script, after code that lexically referenced them.
import time

import psutil
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# CPU-friendly model settings
# Using a smaller model with quantization for CPU compatibility
model_name = "google/gemma-2-2b" # Smaller 2B parameter model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Configure quantization for better CPU performance
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load model with CPU optimizations. bitsandbytes 4-bit loading generally
# needs CUDA, so a plain float32 CPU load is kept as the fallback path.
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",  # Will use CPU if no GPU is available
    )
    using_quantization = True
except Exception as e:
    print(f"Quantization failed with error: {e}")
    print("Falling back to standard CPU loading...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # Use float32 for CPU
        device_map="cpu",  # Explicitly use CPU
    )
    using_quantization = False

print(f"Model loaded on CPU. Using quantization: {using_quantization}")
print(f"Model size: {model_name}")


def generate_response(prompt, max_length=200):
    """Generate a completion for *prompt* and return only the new text.

    max_length is passed as max_new_tokens, so it bounds the generated
    continuation, not prompt + continuation. Kept small for CPU speed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    print("Generating response (this may take a while on CPU)...")
    start_time = time.time()

    # Conservative settings for CPU: greedy decoding, no beam search.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        do_sample=False,  # Deterministic generation is faster
        num_beams=1,      # No beam search for speed
    )

    end_time = time.time()
    print(f"Generation completed in {end_time - start_time:.2f} seconds")

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Strip the decoded prompt prefix so callers get only the continuation.
    prompt_text = tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
    return generated_text[len(prompt_text):]


# Test the model with simpler, shorter prompts for CPU evaluation
test_prompts = [
    "Explain what machine learning is in one paragraph.",
    "Write a haiku about computers.",
    "List three benefits of open-source software.",
]

# Run evaluation, caching each response so the results file below does not
# have to regenerate it (generation is expensive on CPU and, with
# do_sample=False, deterministic — the cached text is identical).
responses = []
print("\nEvaluating Gemini open source model on CPU\n")
print("=" * 50)

for i, prompt in enumerate(test_prompts):
    print(f"\nPrompt {i+1}: {prompt}")
    print("-" * 50)

    start_time = time.time()
    response = generate_response(prompt)
    end_time = time.time()
    responses.append(response)

    print(f"Response time: {end_time - start_time:.2f} seconds")
    print(f"Response:\n{response}")
    print("=" * 50)

# Memory usage information
process = psutil.Process()
memory_info = process.memory_info()
print(f"\nMemory Usage: {memory_info.rss / (1024 * 1024):.2f} MB")

# Save model output to a file for later analysis, reusing the cached
# responses instead of invoking the model a second time per prompt.
with open("gemini_cpu_evaluation_results.txt", "w") as f:
    f.write("GEMINI MODEL CPU EVALUATION RESULTS\n\n")
    for i, (prompt, response) in enumerate(zip(test_prompts, responses)):
        f.write(f"Prompt {i+1}: {prompt}\n")
        f.write(f"Response:\n{response}\n\n")