import os from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from sentence_transformers import SentenceTransformer import spacy import gradio as gr import subprocess # def download_spacy_model(model_name): # command = f"python -m spacy download {model_name}" # process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # stdout, stderr = process.communicate() # # Check if the command executed successfully # if process.returncode != 0: # print(f"An error occurred while downloading the model: {stderr.decode('utf-8')}") # else: # print(f"Successfully downloaded the model: {stdout.decode('utf-8')}") # Call the function to download the model # def find_closest(query): # files_contents = [] # files_names = [] # for file in os.listdir(): # if file.endswith(".txt"): # with open(file, 'r') as f: # content = f.read() # files_contents.append(content) # files_names.append(file) # # Append query to the end # files_contents.append(query) # # Initialize the TfidfVectorizer # tfidf_vectorizer = TfidfVectorizer() # # Fit and transform the texts # tfidf_matrix = tfidf_vectorizer.fit_transform(files_contents) # # Compute the cosine similarity between the query and all files # similarity_scores = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1]) # # Get the index of the file with the highest similarity score # max_similarity_idx = similarity_scores.argmax() # # Return the name of the file with the highest similarity score # return files_names[max_similarity_idx] # def find_closest(query): # try: # nlp = spacy.load('en_core_web_md') # except: # download_spacy_model('en_core_web_md') # nlp = spacy.load('en_core_web_md') # files_names = [] # files_vectors = [] # for file in os.listdir(): # if file.endswith(".txt"): # with open(file, 'r') as f: # content = f.read() # files_names.append(file) # # Get the vector representation of the content # files_vectors.append(nlp(content).vector) # # Get the vector representation of the query # query_vector = nlp(query).vector # # Compute the cosine similarity between the query and all files # similarity_scores = cosine_similarity([query_vector], files_vectors) # # Get the index of the file with the highest similarity score # max_similarity_idx = similarity_scores.argmax() # # Return the name of the file with the highest similarity score # return files_names[max_similarity_idx] def find_closest(query): files_to_exclude = ["packages.txt", "requirements.txt","pre-requirements.txt"] model = SentenceTransformer('all-MiniLM-L6-v2') # You can choose other models files_contents = [] files_names = [] for file in os.listdir(): if file.endswith(".txt") and file not in files_to_exclude : print(f"Found .txt file: {file}") with open(file, 'r') as f: content = f.read() files_contents.append(content) files_names.append(file) # Append query to the end files_contents.append(query) # Create sentence embeddings for each text embeddings = model.encode(files_contents) # Compute the cosine similarity between the query and all files similarity_scores = cosine_similarity([embeddings[-1]], embeddings[:-1]) # Get the index of the file with the highest similarity score max_similarity_idx = similarity_scores.argmax() # Return the name of the file with the highest similarity score return files_names[max_similarity_idx] def find_closest_mp3(query): closest_txt_file = find_closest(query) file_name_without_extension, _ = os.path.splitext(closest_txt_file) return file_name_without_extension + '.mp3' my_theme = gr.Theme.from_hub("ysharma/llamas") with gr.Blocks(theme=my_theme) as demo: gr.Markdown("""