"""Gradio app that summarizes a meeting transcript (.vtt) with the OpenAI chat API.

The module also contains standalone helpers for cleaning a WebVTT file into
plain text (``clean_webvtt`` / ``vtt_to_clean_file``).
"""

import os
import re
import tempfile
from datetime import datetime
from io import StringIO
from os.path import exists, splitext
from typing import Iterator, List

import backoff
import gradio as gr
import markdown
import nltk
import openai
from docx import Document
from nltk.tokenize import word_tokenize

nltk.download('punkt')
openai.api_key = os.getenv("OPENAI_API_KEY")

# Lines that start with a hex group layout 8-4-4-4-12 followed by "/<digits>-<digit>".
_CUE_ID_PATTERN = r'[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}\/\d+-\d'
# Lines that consist of two HH:MM:SS?mmm stamps with anything in between.
_TIMESTAMP_PATTERN = r"^\d{2}:\d{2}:\d{2}.\d{3}.*\d{2}:\d{2}:\d{2}.\d{3}$"

# Prompts sent to the model (runtime text, intentionally kept in Spanish).
_CHUNK_PROMPT = "Resume brevemente esta transcripción de la reunión en el mismo idioma que la entrada del usuario: "
_CONSOLIDATE_PROMPT = "Resume el siguiente texto: "
_FINAL_SYSTEM_PROMPT = (
    "Consolidar y resumir el texto de las transcripciones de la reunión. "
    "El formato de salida debe ser markdown en el mismo idioma que la entrada del usuario. "
    "Comenzar con un resumen breve de la reunión, continuar con puntos destacados "
    "que describan los aspectos más importantes de la discusión. "
    "Finalmente, proporcionar una tabla para mostrar la lista de acciones con 3 columnas: "
    "Acción, Persona Asignada, Fecha de Vencimiento."
)

# Sampling parameters shared by the consolidation and final requests.
_LOW_VARIANCE_PARAMS = {
    "temperature": .1,
    "top_p": 1,
    "frequency_penalty": 0,
    "presence_penalty": 0,
}


def clean_webvtt(filepath: str) -> str:
    """Clean up the content of a subtitle file (vtt) to a string

    Args:
        filepath (str): path to vtt file

    Returns:
        str: clean content (empty string when the file has no text)
    """
    with open(filepath, "r", encoding="utf-8") as fp:
        content = fp.read()

    # Drop blank lines and surrounding whitespace.
    lines = [line.strip() for line in content.split("\n") if line.strip()]

    # Drop the header line; the emptiness guard avoids an IndexError on an
    # empty or whitespace-only file.
    if lines and lines[0].upper() == "WEBVTT":
        lines = lines[1:]

    # Drop numeric cue indexes, cue identifiers and timestamp lines.
    lines = [
        line
        for line in lines
        if not line.isdigit()
        and not re.match(_CUE_ID_PATTERN, line)
        and not re.match(_TIMESTAMP_PATTERN, line)
    ]

    content = " ".join(lines)

    # Collapse runs of whitespace into a single space.
    content = re.sub(r"\s+", r" ", content)

    # Add a space after punctuation marks if it doesn't exist.
    content = re.sub(r"([\.!?])(\w)", r"\1 \2", content)

    return content


def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
    """Save clean content of a subtitle file to text file

    Args:
        file_in (str): path to vtt file
        file_out (None, optional): path to text file. When omitted, the path
            is derived from ``file_in`` with a ``.txt`` extension, adding a
            numeric suffix (``_1``, ``_2``, ...) until the path is unused.
        **kwargs (optional): arguments for other parameters
            - no_message (bool): do not show message of result.
              Default is False

    Returns:
        str: path to text file
    """
    no_message = kwargs.get("no_message", False)

    if not file_out:
        filename = splitext(file_in)[0]
        file_out = "%s.txt" % filename
        i = 0
        # Never overwrite an existing file when the name is auto-generated.
        while exists(file_out):
            i += 1
            file_out = "%s_%s.txt" % (filename, i)

    content = clean_webvtt(file_in)
    with open(file_out, "w+", encoding="utf-8") as fp:
        fp.write(content)

    if not no_message:
        print("clean content is written to file: %s" % file_out)

    return file_out


def break_up_file(tokens, chunk_size, overlap_size):
    """Yield consecutive chunks of ``tokens`` that overlap each other.

    Every chunk except the last has exactly ``chunk_size`` tokens; each chunk
    starts ``chunk_size - overlap_size`` tokens after the previous one. The
    last chunk holds whatever remains (at most ``chunk_size`` tokens).

    Args:
        tokens (list): token sequence to split
        chunk_size (int): maximum number of tokens per chunk
        overlap_size (int): number of tokens shared by neighbouring chunks

    Yields:
        list: the next chunk of tokens

    Raises:
        ValueError: if the input must be split but ``overlap_size`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if len(tokens) <= chunk_size:
        yield tokens
        return

    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError("overlap_size must be smaller than chunk_size")

    # Iterative sliding window: no recursion depth limit and no repeated
    # copying of the remaining tail.
    start = 0
    while len(tokens) - start > chunk_size:
        yield tokens[start:start + chunk_size]
        start += step
    yield tokens[start:]


def break_up_file_to_chunks(filename, chunk_size=3000, overlap_size=100):
    """Read a text file, tokenize it and split the tokens into chunks.

    Args:
        filename (str): path to the file to read
        chunk_size (int, optional): maximum tokens per chunk. Default is 3000
        overlap_size (int, optional): tokens shared by neighbouring chunks.
            Default is 100

    Returns:
        list: list of token lists, as produced by ``break_up_file``
    """
    # Explicit UTF-8, consistent with clean_webvtt, so that reading does not
    # depend on the platform's default encoding.
    with open(filename, 'r', encoding="utf-8") as f:
        text = f.read()
    tokens = word_tokenize(text)
    return list(break_up_file(tokens, chunk_size, overlap_size))


def convert_to_prompt_text(tokenized_text):
    """Join tokens back into a prompt string.

    Tokens that contain at least one digit anywhere are dropped before
    joining, and the detached possessive ``" 's"`` is re-attached to the
    preceding word.

    Args:
        tokenized_text (list): list of string tokens

    Returns:
        str: space-joined text
    """
    digit_free = [
        token for token in tokenized_text
        if not any(c.isdigit() for c in token)
    ]
    prompt_text = " ".join(digit_free)
    prompt_text = prompt_text.replace(" 's", "'s")
    return prompt_text


@backoff.on_exception(
    backoff.expo,
    (openai.error.RateLimitError, openai.error.APIConnectionError),
)
def _chat_completion(model, messages, **params):
    """Send one chat request and return the stripped text of the first choice.

    Retried with exponential backoff on rate-limit and connection errors, so
    that a failure only repeats this single request.
    """
    response = openai.ChatCompletion.create(model=model, messages=messages, **params)
    return response["choices"][0]["message"]['content'].strip()


def summarize_meeting(filepath):
    """Summarize the meeting transcript stored at ``filepath``.

    The file is read and tokenized as-is, split into overlapping chunks, and
    processed in three stages: each chunk is summarized, each chunk summary
    is summarized again, and the joined result is consolidated into a final
    markdown summary.

    Args:
        filepath (str): path to the transcript file

    Returns:
        str: final summary text returned by the model
    """
    print(filepath)

    # Break the text of the meeting transcript into chunks.
    chunks = break_up_file_to_chunks(filepath)

    # Stage 1: summarize each chunk.
    chunk_summaries = []
    for i, chunk in enumerate(chunks):
        print(i)
        print(chunk)
        prompt_request = convert_to_prompt_text(chunk)
        print(prompt_request)
        prompt_request = _CHUNK_PROMPT + prompt_request
        chunk_summaries.append(
            _chat_completion(
                "gpt-3.5-turbo",
                [{"role": "user", "content": prompt_request}],
                temperature=.3,
            )
        )

    # Stage 2: condense each chunk summary.
    consolidated_summary = [
        _chat_completion(
            "gpt-3.5-turbo",
            [{"role": "user", "content": _CONSOLIDATE_PROMPT + summary}],
            **_LOW_VARIANCE_PARAMS,
        )
        for summary in chunk_summaries
    ]

    # Stage 3: consolidate everything into the final summary using GPT-4.
    final_summary_request = " ".join(consolidated_summary)
    final_summary = _chat_completion(
        "gpt-4",
        [
            {"role": "system", "content": _FINAL_SYSTEM_PROMPT},
            {"role": "user", "content": final_summary_request},
        ],
        **_LOW_VARIANCE_PARAMS,
    )
    return final_summary


def summarize_meeting_vtt(file):
    """Gradio callback: summarize the uploaded file.

    Args:
        file: uploaded file object; its ``name`` attribute is used as the
            path passed to ``summarize_meeting``

    Returns:
        str: summary text
    """
    temp_file_path = file.name
    summary_text = summarize_meeting(temp_file_path)
    return summary_text


demo = gr.Interface(
    fn=summarize_meeting_vtt,
    # input
    inputs=gr.File(label="Archivo .vtt"),
    # output
    outputs=[
        gr.Markdown(label="Resumen de la reunión")
    ],
    title="Minuteevo - Ayudante para Minutas",
    description="Descarga la transcripción de la reunión en formato .vtt y carga el archivo aquí para obtener el resumen de la reunión para que puedas crear tu minuta.")

if __name__ == "__main__":
    demo.launch()