import gradio as gr
import pandas as pd
import os
import spacy
import spacy.cli
import numpy as np
import zipfile
import tempfile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing.text import text_to_word_sequence
import openai
import re

# The OpenAI API key is read from the 'API' environment variable.
openai.api_key = os.getenv('API')

# NOTE(review): `nlp` is loaded here (and may trigger a large model download
# on first run) but is never referenced anywhere below — confirm nothing else
# depends on this side effect before removing it.
try:
    nlp = spacy.load('en_core_web_md')
except OSError:  # spacy raises OSError when the model package is absent
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load('en_core_web_md')

# Vendor questionnaire DataFrames, keyed by their xlsx file name.
vendor_df_dict = {}
# Total Trellis Score per vendor file; rebuilt on every process_file() call.
total_trellis_scores = {}


def preprocess_text(text):
    """Normalise *text*: lower-case, strip punctuation, drop stop words.

    Non-string input (e.g. NaN coming from pandas) is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    text = text.lower()
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    text = text.translate(str.maketrans('', '', filters))
    stop_words = ['does', 'the', 'offer', 'do', 'you', 'require']
    words = text_to_word_sequence(text)
    return ' '.join(word for word in words if word not in stop_words)


def semantic_search(query, data):
    """Return the index in *data* whose text is most similar to *query*.

    Similarity is cosine similarity over TF-IDF vectors of the
    preprocessed texts.
    """
    query = preprocess_text(str(query))
    docs = [preprocess_text(str(text)) for text in data]
    vectors = TfidfVectorizer().fit_transform([query] + docs)
    similarities = cosine_similarity(vectors[0:1], vectors).flatten()
    # Position 0 is the query itself; argmax over the rest yields a
    # 0-based index into *data*.
    return np.argmax(similarities[1:])


def parse_score(score):
    """Convert a questionnaire score (label, text or number) to a float.

    - Known level labels map to 1-5.
    - None / blank maps to the neutral default 3.
    - Numeric cells are returned as-is (bug fix: they previously fell
      through to 0 because only strings were ever parsed).
    - Otherwise the first number embedded in the string is used, else 0.
    """
    level_scores = {
        'Level 1 - Basic': 1, 'Level 2 - Developing': 2,
        'Level 3 - Intermediate': 3, 'Level 4 - Advanced': 4,
        'Level 5 - Leading': 5,
        '1 - Low': 1, '2 - Below average': 2, '3 - Average': 3,
        '4 - Above average': 4, '5 - High': 5,
        '1 - Very Low': 1, '2 - Low': 2, '3 - Medium': 3,
        '4 - High-Medium': 4, '5 - Very High': 5,
    }
    if score is None or str(score).strip() == '':
        return 3
    if isinstance(score, str):
        score = score.replace(',', '.')  # accept comma decimal separators
        if score in level_scores:
            return level_scores[score]
        number = re.findall(r"[-+]?\d*\.\d+|\d+", score)
        if number:
            return float(number[0])
        return 0
    # Bug fix: numeric Excel cells (int/float) used to return 0.
    if isinstance(score, (int, float)) and not pd.isna(score):
        return float(score)
    return 0


def load_vendor_files(zip_file_path):
    """Populate vendor_df_dict from the .xlsx files inside *zip_file_path*.

    For each vendor workbook: drop rows answered 'No' (3rd column),
    preprocess the answer text, and derive a numeric 'score_vendor'
    column from the 5th column (blank scores default to '3 - Medium').
    """
    global vendor_df_dict
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref, \
            tempfile.TemporaryDirectory() as temp_dir:  # cleanup guaranteed
        zip_ref.extractall(temp_dir)
        for file_name in os.listdir(temp_dir):
            if not file_name.endswith(".xlsx"):
                continue
            df = pd.read_excel(os.path.join(temp_dir, file_name))
            df = df[df.iloc[:, 2] != 'No']  # exclude 'No' answers
            df.iloc[:, 2] = df.iloc[:, 2].apply(preprocess_text)
            # Bug fix: fillna(inplace=True) on an iloc slice acted on a
            # copy and was silently ignored; assign the result back.
            df.iloc[:, 4] = df.iloc[:, 4].fillna('3 - Medium')
            df['score_vendor'] = (
                df.iloc[:, 4].apply(parse_score).astype(float).fillna(0)
            )
            vendor_df_dict[file_name] = df


def process_file(client_file):
    """Match a client questionnaire against every vendor file.

    Loads vendors from 'vendor_files.zip' (next to this script), pairs
    each required client item with the most similar vendor answer,
    multiplies the two scores into a Trellis Score, writes one sheet per
    vendor (plus a total row) to 'matches.xlsx', and asks GPT-3.5 for a
    comparison of the top three vendors.

    Returns (status_text, path_to_matches_xlsx, top_three_vendor_names),
    or ("No matching data found.", None, None) when nothing matched.
    """
    global total_trellis_scores
    # Bug fix: reset the global so repeated Gradio runs don't accumulate
    # stale totals from previous uploads.
    total_trellis_scores = {}

    zip_file_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'vendor_files.zip')
    load_vendor_files(zip_file_path)

    client_df = pd.read_excel(client_file.name)
    # Bug fix: assign fillna result back (inplace on an iloc slice is a no-op).
    client_df.iloc[:, 2] = client_df.iloc[:, 2].fillna('3 - Medium')
    client_df['score_client'] = (
        client_df.iloc[:, 2].apply(parse_score).astype(float)
    )
    client_df = client_df[client_df.iloc[:, 1] == 'Yes']  # required items only
    client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)

    # Build one result DataFrame per vendor BEFORE opening the writer, so an
    # empty workbook is never saved (openpyxl refuses zero-sheet files).
    sheets = {}
    for vendor_file, vendor_df in vendor_df_dict.items():
        common_list = []
        for _, row in client_df.iterrows():
            most_similar_index = semantic_search(
                row.iloc[0], vendor_df.iloc[:, 2])
            most_similar_row = vendor_df.iloc[most_similar_index, :]
            client_score = row['score_client']  # pre-parsed numeric score
            vendor_score = parse_score(most_similar_row.iloc[4])
            combined_row = pd.concat([row.iloc[[0, 2, 3]],
                                      most_similar_row.iloc[[0, 2, 4, 5]]])
            combined_row['Trellis Score'] = client_score * vendor_score
            common_list.append(combined_row)
        if not common_list:
            continue
        common_df = pd.DataFrame(common_list)
        total_score = common_df['Trellis Score'].sum()
        total_trellis_scores[vendor_file] = total_score
        # Append a blank row carrying only the total Trellis Score.
        common_df.loc[len(common_df.index)] = [np.nan] * len(common_df.columns)
        common_df.at[len(common_df.index) - 1, 'Trellis Score'] = total_score
        # Excel limits sheet names to 31 characters.
        sheets[os.path.splitext(vendor_file)[0][:31]] = common_df

    if not sheets:
        return "No matching data found.", None, None

    with pd.ExcelWriter('matches.xlsx') as writer:
        for sheet_name, sheet_df in sheets.items():
            sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

    top_three_vendors = sorted(
        total_trellis_scores, key=total_trellis_scores.get, reverse=True)[:3]
    top_three_vendors = [vendor.split(".")[0] for vendor in top_three_vendors]

    def gpt3_query(prompt, engine='gpt-3.5-turbo', max_tokens=3000,
                   temperature=0.3):
        """Best-effort chat completion; returns None on any API error."""
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are a helpful AI."},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            return response['choices'][0]['message']['content'].strip()
        except Exception as e:
            print(f"Error in gpt3_query: {str(e)}")
            return None

    # Ask GPT-3.5-turbo for a summary of the top three vendors.
    summary = gpt3_query(f"""Based on the Trellis Score, the top three vendors are {', '.join(top_three_vendors)}. We have analyzed the performance of several vendors and identified the top three based on their Trellis Scores. Here are the key data points for each:

1. [name vendor 1]
- Trellis Score: {total_trellis_scores}
- Strengths: [brief summary of strengths]
- Weaknesses: [brief summary of weaknesses]
- Key Features: [brief summary of key features]

2. [name vendor 2]
- Strengths: [brief summary of strengths]
- Weaknesses: [brief summary of weaknesses]
- Key Features: [brief summary of key features]

3. [name vendor 3]
- Strengths: [brief summary of strengths]
- Weaknesses: [brief summary of weaknesses]
- Key Features: [brief summary of key features]

Based on this format, please provide a detailed breakdown of the strengths and weaknesses of each vendor, along with an overall comparison.
""")

    return (f"Matching data has been saved to 'matches.xlsx'.\n\n{summary}",
            os.path.abspath('matches.xlsx'),
            top_three_vendors)


iface = gr.Interface(
    fn=process_file,
    inputs=[gr.components.File(label="Client File")],
    outputs=[
        gr.components.Textbox(label="Status"),
        gr.components.File(label="Download Match Results"),
        gr.components.Textbox(label="Top Three Vendors"),
    ],
)

iface.launch()