# Hhhgg / app.py
import gc
import psutil
import os
import torch
from fastapi import FastAPI
from langchain.llms import VLLM
from cachetools import TTLCache
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import asyncio
import torch.nn.utils.prune as prune
from concurrent.futures import ThreadPoolExecutor
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
nltk.download('punkt')
nltk.download('stopwords')
app = FastAPI()
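# Four model slots; each is loaded at startup and paired with its own TTL cache
# and response history below.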
model_1 = None
model_2 = None
model_3 = None
model_4 = None
# Using TTLCache from cachetools
cache_1 = TTLCache(maxsize=100, ttl=600) # maxsize=100 and ttl=600 (10 minutes)
cache_2 = TTLCache(maxsize=100, ttl=600)
cache_3 = TTLCache(maxsize=100, ttl=600)
cache_4 = TTLCache(maxsize=100, ttl=600)
previous_responses_1 = []
previous_responses_2 = []
previous_responses_3 = []
previous_responses_4 = []
MAX_TOKENS = 2048
executor = ThreadPoolExecutor(max_workers=4)
# Configuration: run on CPU only
device = torch.device("cpu")
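# Return an earlier response when the new one is a near-duplicate (TF-IDF cosine
# similarity > 0.7); otherwise keep the new response.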
def get_best_response(new_response, previous_responses):
if not previous_responses:
return new_response
    tfidf_matrix = TfidfVectorizer().fit_transform(previous_responses + [new_response])
    # Compare the new response (last row) against every previous response.
    cosine_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
    max_sim_index = cosine_sim.argmax()
    max_sim_score = cosine_sim[0][max_sim_index]
if max_sim_score > 0.7:
return previous_responses[max_sim_index]
return new_response
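# Simple extractive summary: score sentences by the frequency of their non-stopword
# terms and keep the top three.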
def summarize_text(text):
sentences = sent_tokenize(text)
stop_words = set(stopwords.words("english"))
word_frequencies = Counter()
for sentence in sentences:
words = word_tokenize(sentence.lower())
words = [word for word in words if word.isalpha() and word not in stop_words]
word_frequencies.update(words)
most_common_words = word_frequencies.most_common(50)
most_common_words = {word: freq for word, freq in most_common_words}
ranked_sentences = []
for sentence in sentences:
score = sum(most_common_words.get(word, 0) for word in word_tokenize(sentence.lower()))
ranked_sentences.append((score, sentence))
ranked_sentences.sort(reverse=True, key=lambda x: x[0])
summary = ' '.join([sentence for _, sentence in ranked_sentences[:3]])
return summary
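# Free memory aggressively; if system RAM usage exceeds 90%, drop all loaded models
# so they are rebuilt on the next periodic reload.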
def clear_memory():
gc.collect()
process = psutil.Process(os.getpid())
memory_usage = psutil.virtual_memory().percent
if memory_usage > 90:
global model_1, model_2, model_3, model_4
model_1 = None
model_2 = None
model_3 = None
model_4 = None
gc.collect()
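# Optional helper (currently unused): randomly prunes 20% of the weights in every
# Linear layer of a torch model.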
def apply_pruning(model):
for name, module in model.named_modules():
if isinstance(module, torch.nn.Linear):
prune.random_unstructured(module, name="weight", amount=0.2)
prune.remove(module, name="weight")
return model
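# Split text into chunks of at most max_tokens whitespace-delimited words.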
def split_input(input_text, max_tokens):
tokens = input_text.split()
chunks = []
chunk = []
total_tokens = 0
    for word in tokens:
        # Each whitespace-delimited word counts as one token toward the budget.
        if total_tokens + 1 > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = [word]
            total_tokens = 1
        else:
            chunk.append(word)
            total_tokens += 1
if chunk:
chunks.append(" ".join(chunk))
return chunks
def split_output(output_text, max_tokens):
    # The output is re-chunked with exactly the same word-count logic as the input.
    return split_input(output_text, max_tokens)
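# Build a cached, chunk-aware generation callable around a vLLM-backed LangChain chain.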
def create_langchain_model(model_name: str, device: torch.device, cache, previous_responses):
    # The LangChain VLLM wrapper takes the model id via `model`; vLLM handles its own
    # device placement, so the torch device is kept only for interface symmetry.
    vllm_llm = VLLM(model=model_name)
template = """
You are a helpful assistant. Given the following text, generate a meaningful response:
{input_text}
"""
prompt = PromptTemplate(input_variables=["input_text"], template=template)
chain = LLMChain(llm=vllm_llm, prompt=prompt)
def generate_for_model(input_text):
cached_output = cache.get(input_text)
if cached_output:
return cached_output
        # Chunk long inputs so each request stays under the token budget.
        input_chunks = split_input(input_text, MAX_TOKENS)
        output_text = ""
        prev_output = ""
        for chunk in input_chunks:
            # Carry the last ~50 generated words forward as lightweight context.
            chunk_prompt = prev_output + chunk
            output_text += chain.run(input_text=chunk_prompt)
            prev_output = " ".join(output_text.split()[-50:]) + " "
        output_chunks = split_output(output_text, MAX_TOKENS)
        first_chunk = output_chunks[0] if output_chunks else ""
        best_response = get_best_response(first_chunk, previous_responses)
cache[input_text] = best_response
previous_responses.append(best_response)
return best_response
return generate_for_model
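# Load (or reload) all four model pipelines.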
async def load_models():
global model_1, model_2, model_3, model_4
model_1 = create_langchain_model("Hjgugugjhuhjggg/llama-3.2-1B-spinquant-hf", device, cache_1, previous_responses_1)
model_2 = create_langchain_model("Qwen/Qwen2.5-Coder-1.5B", device, cache_2, previous_responses_2)
model_3 = create_langchain_model("Qwen/Qwen2.5-3B-Instruct", device, cache_3, previous_responses_3)
model_4 = create_langchain_model("gpt2", device, cache_4, previous_responses_4)
print("Modelos cargados exitosamente.")
async def optimize_models_periodically():
    # Reload the models once an hour; startup already performs the initial load.
    while True:
        await asyncio.sleep(3600)
        await load_models()
@app.on_event("startup")
async def startup():
    await load_models()
    # These loops run forever, so start them as background tasks instead of
    # registering them as extra startup handlers (which would never return).
    asyncio.create_task(monitor_memory())
    asyncio.create_task(optimize_models_periodically())
async def monitor_memory():
while True:
clear_memory()
await asyncio.sleep(60)
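# Generate a response with one named model; the heavy work runs in the thread pool
# so the event loop stays responsive.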
@app.get("/generate")
async def generate_response(model_name: str, input_text: str):
    models = {"model1": model_1, "model2": model_2, "model3": model_3, "model4": model_4}
    model = models.get(model_name)
    if model is None:
        # Unknown name, or the model was unloaded by clear_memory and not yet reloaded.
        return {"error": "Model not found"}
    result = await asyncio.get_event_loop().run_in_executor(executor, model, input_text)
    return {f"{model_name}_output": result}
@app.get("/unified_summary")
async def unified_summary(input_text: str):
output1 = await generate_response(model_name="model1", input_text=input_text)
output2 = await generate_response(model_name="model2", input_text=input_text)
output3 = await generate_response(model_name="model3", input_text=input_text)
output4 = await generate_response(model_name="model4", input_text=input_text)
combined_response = output1.get("model1_output", "") + " " + \
output2.get("model2_output", "") + " " + \
output3.get("model3_output", "") + " " + \
output4.get("model4_output", "")
summarized_response = summarize_text(combined_response)
return {"summary": summarized_response}