# Third-party dependencies: tiktoken (tokenizer), openai (embeddings +
# chat completions), pandas/numpy (embedding storage), gradio (web UI).
import tiktoken
import openai
import pandas as pd
import numpy as np
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity
import os
import gradio as gr

# Credentials come from the environment; a missing variable raises KeyError
# at import time, which fails fast instead of running unauthenticated.
openai.api_key = os.environ["openai_key"]
HF_API_TOKEN = os.environ["HF_API_TOKEN"]

# Saver that persists flagged answers to the HuggingFace dataset "answer_flags".
# NOTE(review): created but not wired to any component below — the app uses
# gr.CSVLogger instead; confirm whether this is still needed.
hf_writer = gr.HuggingFaceDatasetSaver(HF_API_TOKEN, "answer_flags")

# Staff/agent names; only referenced by the (currently commented-out)
# "Who answered?" dropdown further down.
# NOTE(review): contains duplicates (e.g. 'Eline Verdonk') — harmless for a
# dropdown, but worth confirming before relying on it as a unique set.
names = ['Mulan Fan','Stanislav Stefanov','Vinicius Pedrini','Sydney Hosaka','Johnny Collins','Alex Cappy','Tiago Costa','Tarun Tampi','Derek Kedziora','Ruiqi Li','Eyal Vogel','Eline Verdonk','Samuel Hernandez','Jessica Tylkowski','Dominik Oles','Karlo Stetic','Pawel Gershkovich','Samantha van Roosenbeek','Amy Meyer','Diego Brizuela','Brandon Elsner','Jonathan McDonald','Yovani Barrera','Max Greenberg','Customer Service','Matthew Avery','Robin Brockötter','Michelle Johnsen','Tyler Hill','Caleb Baird','Benjamin Basel','Reveka Fili','Dan Scahill','Stijn van Schaik','Shane Admiraal','Simi Essien','Iulia Mihet','Yoanna Hristova','Derek Cameron','Tom Gal','Brad Burling','Jake King','Eline Verdonk','Neil Gilbody','James Miklos','Grant Fisher','Mireia Carbo Molven','Erin Richter','Max Davis','Sasha Kaliuzhnyi','Jonathan Clark','Rohini Tippannavar','Augustus Stout','Karin Kamphuis','Bram Straathof','Aaron Schleichkorn','Ford Gaitley','Ciprian Curta','Jungeun Lee','Donald Mathaa','Enrico Vaccalluzzo','Anna Filimonova','sushma Gorrepati','Alison Macbeth','Fatih Toker','Manthan Hawal','Valeriia Lashko','Las Mahmood','Anthony Giampapa','Clemence Schmidt','Gregoire Bertagnolio','Vlad Roulla','Simonas Seskevicius','michael kerr','Pedro b','Shak Akhrarov','Paul King','Nirva Portugal','The Hubs Team','Marissa Andres','Quality Control Amsterdam','Brandon Skoog','Abhay Patel','elsa tarrago','Jaime Duran','Roman Iutsis','Nicholas Sagan','Karim Youcef','Austin Daugherty','Tobias Domeier','Jesus Serrano','Andrew Lee','Emily Kelly','RDA Sourcing Team Email','Chris Chen','Josh Parker','Hashim Chishty','Brian Junk','Chelsea Turowski','Vincent Le Siou','David Zavacki','Daisy Ruiz','Marco Saviano','Diana Bonis','Chandler Hastings','Bahadir Ozdemir','Michele Immacolato','Mohamed Mustafa','Rachael Green','Aryn Thomas','Ahren Alexander','Sam Tucker','Arthur Lekic','Daniel Salazar','clara Test','Daniele Mariotti','Lilla Petri','QC Chicago','Francois Lesage','Tasha Vos','Derek Higgins','Filipe Santos','Marianna Procino','3D Hubs Team','Caleb Shadid','Clara Girardeau','Pepijn Hogt','Athina Dimitreli','Artur Akhankov','Ozgun Ogretmen','Shane Jetton','Amanda Worrell','Margarita Sheptitskaya','Diego Camelo','Sourcing Team Account','Bryce Beisswanger','Alexander Bergsma','Tommaso Gerevini','Not A Mechanical Engineer Alexander Konstantinov','marie chaproniere','Katie Guo','Régent Coridon','George Nutting','Tony Bonilla','Jackson Zartman','Su Turktas','Zhenghao Xia','hannah larson','Robert Ficken','Xiaohan Li','Alison McGarry','C Martin','Stijn van Lieshout','merritt gurley','Yuecen Li','Marcel Gallegos','Siddhant Jain','Luis Brites','Aron Abbo','Anita Ayavong','Amirmasoud Gharavian','Ibrahim Tarim','Lorenzo Musella','Rob Draaijer','Lindor Castro Valdez','It Support','Bono de Visser','Stijn Hesse','Aditya Muralidhara','Iñigo Garcia','Bastien Dupel','Artem Platonov','Jurien Groot','Heather Rivard','Jason Stewart','Fabian Thess','Agnieszka Wronkowska','Leon Sontag','Charlie Fleece','Justin Stebbins','Pjotr Horowitz','Chandrakant Isi','Zeline Jones','Anvay Goenka','test account','Matt Farr','Wilmer Milloria','DevOps 3D Hubs','Philippe Tarjan','Should Costing Team','vj - not_sourcing_manager','Teodor Sodolescu','Milton Santiago','Arnoldas Kemeklis','Tommaso Barzocchi','Jack Jackson','Francesco Rivalta','Sourav Mitra','Cagatay Tanyildiz','Logistics Support','Sam Luttmann','Luna Guimarães','Gabby VanBeaver','Barbara Bartkova','Alvin Castro','Lynnelle Mulder','Ties van Haastrecht','Emre Kilic','Amanda Miller','Daniel Loeser','Aaron Korte','Tina Strahinic','Jeremy Birch','Bilyana Kalfin','Yoram Shanan']
###############################################################################
## Steps 7-10 (one-off preprocessing, intentionally disabled)
###############################################################################
# The original notebook-style pipeline lived here, commented out:
#   Step 7: load the cl100k_base tokenizer (ada-002 compatible), read
#           processed/base_data_03-06-2023.csv, count tokens per row.
#   Step 8: split texts longer than max_tokens=1500 into sentence chunks.
#   Step 9: rebuild the dataframe from the shortened chunks.
#   Step 10: embed each chunk with text-embedding-ada-002 and write
#            processed/embeddings.csv.
# It was run once to produce processed/embeddings.csv, which is all this
# app needs at runtime (loaded below).

###############################################################################
## Step 11: load the precomputed embeddings
###############################################################################
df = pd.read_csv('processed/embeddings.csv', index_col=0)

# The embeddings column is stored as the string repr of a Python list.
# Use ast.literal_eval rather than eval(): the CSV content is data, and
# eval() would execute arbitrary code if the file were ever tampered with.
import ast
df['embeddings'] = df['embeddings'].apply(ast.literal_eval).apply(np.array)
def Formatter(blocks):
    """Render a list of text blocks as an HTML fragment.

    Each block becomes a ``<div>``; each newline-separated paragraph inside a
    block becomes a ``<p>``.  Built with ``str.join`` instead of repeated
    ``+=`` concatenation.

    NOTE(review): the original markup in this file was corrupted (tags were
    stripped from the string literals); the tag structure below is a
    reconstruction of the original skeleton — confirm against the deployed
    version.

    :param blocks: iterable of plain-text blocks.
    :return: HTML string (leading blank lines preserved from the original).
    """
    parts = ["\n\n\n\n\n"]
    for block in blocks:
        parts.append("<div>\n")
        for paragraph in block.split("\n"):
            parts.append(f"<p>{paragraph}</p>\n")
        parts.append("</div>\n")
    parts.append("\n")
    return "".join(parts)


def create_context(question, df, max_len=1800, size="ada"):
    """Create a context for a question from the most similar stored texts.

    Embeds *question* with text-embedding-ada-002, ranks the dataframe rows
    by cosine distance, and greedily accumulates the closest texts until the
    running token budget (*max_len*, with +4 overhead per chunk) is exceeded.

    :param question: user question to embed.
    :param df: dataframe with 'text', 'n_tokens' and 'embeddings' columns;
        a 'distances' column is written into it as a side effect.
    :param max_len: token budget for the assembled context.
    :param size: unused; kept for interface compatibility.
    :return: HTML-formatted context (see Formatter).
    """
    # Embed the question once.
    q_embeddings = openai.Embedding.create(
        input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Cosine distance from the question to every stored chunk.
    df['distances'] = distances_from_embeddings(
        q_embeddings, df['embeddings'].values, distance_metric='cosine')

    returns = []
    cur_len = 0
    # Closest chunks first; stop once the token budget is spent.
    for _, row in df.sort_values('distances', ascending=True).iterrows():
        cur_len += row['n_tokens'] + 4
        if cur_len > max_len:
            break
        returns.append(row["text"])

    return Formatter(returns)


def answer_question(
    df,
    model="gpt-3.5-turbo",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1000,
    size="ada",
    debug=False,
    max_tokens=500,
    stop_sequence=None,
):
    """Answer a question based on the most similar context from the dataframe.

    :param df: dataframe of embedded texts (see create_context).
    :param model: chat model name passed to the OpenAI API.
    :param question: question to answer.
    :param max_len: token budget for the retrieved context.
    :param size: unused; kept for interface compatibility.
    :param debug: when True, print the retrieved context to stdout.
    :param max_tokens: completion-length cap.
    :param stop_sequence: optional stop sequence for the completion.
    :return: dict with 'Answer' (model reply) and 'Context' (retrieved HTML).
        On API failure a dict is still returned (with a fallback answer) so
        callers that index 'Answer'/'Context' never crash.
    """
    context = create_context(question, df, max_len=max_len, size=size)

    if debug:
        # Previously this branch was a no-op (`context = context`); make the
        # debug flag actually show what was retrieved.
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completion using the question and context.
        response = openai.ChatCompletion.create(
            messages=[
                {"role": "system",
                 # "concise" typo fixed from the original prompt.
                 "content": "You're an automated response bot for the first reply to a customers email. Answer the question in form of an email to a customer based on the context below and provide a relevant url if its available. Be clear and concise."},
                {"role": "system",
                 # Stray escaped quote after {context} removed.
                 "content": f"Context: {context}\nQuestion: {question}. Answer in the same language content from user question."},
                {"role": "user", "content": f"{question}"},
            ],
            temperature=0.7,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        answer = response["choices"][0]["message"]['content'].strip()
        return {'Answer': f'{answer}', 'Context': f'{context}'}
    except Exception as e:
        # Originally returned "" here, which made question_bot crash on
        # answer['Answer']; return the same dict shape instead.
        print(e)
        return {'Answer': "I don't know and a human will follow up soon.",
                'Context': f'{context}'}


################################################################################
### Step 13
################################################################################
# A long list of supplier-qualification questions (organization name, address,
# QMS certification, RoHS/REACH/Conflict-Minerals compliance, ...) used to
# batch-test answer_question() lived here, commented out.

# NOTE(review): this variable is never read below — confirm it can be removed.
question = 'What is life?'
import csv


def generate_leaderboard(file_path):
    """Build an HTML leaderboard table from the flagged-answers CSV log.

    Scores one point per flagged row for the name in the 'Who' column
    (index 2 of each row) and renders the tallies as an HTML table sorted
    by count, descending.  If the log file does not exist yet (no flags
    recorded), an empty table is returned instead of raising.

    NOTE(review): the table markup in the original file was corrupted; the
    tags below are a reconstruction — confirm against the deployed version.

    :param file_path: path to the CSV log written by gr.CSVLogger.
    :return: HTML string for a gr.HTML component.
    """
    leaderboard = {}
    try:
        with open(file_path, newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            next(reader, None)  # skip the header row
            for row in reader:
                if not row:  # skip empty rows
                    continue
                user = row[2]  # index 2 is the 'Who answered?' column
                leaderboard[user] = leaderboard.get(user, 0) + 1
    except FileNotFoundError:
        # No flags yet — show an empty leaderboard instead of crashing.
        return "<table><tr><th>User</th><th>Count</th></tr></table>"

    sorted_leaderboard = sorted(leaderboard.items(), key=lambda item: item[1],
                                reverse=True)

    # Format the leaderboard as an HTML table.
    rows = ["<table><tr><th>User</th><th>Count</th></tr>"]
    for user, count in sorted_leaderboard:
        rows.append(f"<tr><td>{user}</td><td>{count}</td></tr>")
    rows.append("</table>")
    return "".join(rows)


def question_bot(question):
    """Gradio handler: answer a question and refresh the leaderboard.

    :param question: text from the question Textbox.
    :return: (answer text, context HTML, leaderboard HTML) — three values,
        matching the three output components wired below.
    """
    answer = answer_question(df, question=f"{question}", debug=True)
    if not isinstance(answer, dict):
        # Defensive: older answer_question returned "" on API failure.
        answer = {'Answer': "I don't know and a human will follow up soon.",
                  'Context': ''}
    leaderboard = generate_leaderboard('flagged_data_points/log.csv')
    return answer['Answer'], answer['Context'], leaderboard


def only_leaderboard():
    """Gradio handler: return just the refreshed leaderboard HTML."""
    return generate_leaderboard('flagged_data_points/log.csv')


callback = gr.CSVLogger()
file_path = 'flagged_data_points/log.csv'

chat = gr.Blocks(css=".gradio-container {background-color: #191919;}.row {text-align: center;} ,leaderboard {align: left;}")

with chat:
    # NOTE(review): the Markdown headings below were corrupted in this file
    # (HTML tags stripped); plain-Markdown headings are used as a
    # reconstruction — confirm against the deployed version.
    gr.Markdown("""# Hubs Chat""")

    with gr.Row():
        with gr.Column():
            gr.Markdown("""### 1. Ask your question here:""")
            display_answer = gr.Textbox(label='Answer', lines=5)
            # Label typo fixed ("Ask you Question here?").
            message = gr.Textbox(label='Ask your question here')
            examples = gr.Examples(
                # "tollerances" typo fixed.
                examples=["How do I reach out to Hubs?",
                          "What technologies do you offer and which tolerances are available?"],
                inputs=[message])
            submit = gr.Button("SEND")
        with gr.Column():
            gr.Markdown("""### 2. Submit your correction here:""")
            # who = gr.Dropdown(names, label="Who answered?")
            who = gr.Textbox(label='Who answered?', lines=1)
            correction = gr.Textbox(label='Correction', lines=5)
            with gr.Row():
                ok = gr.Button("Report!")
            # "cliked" typo fixed.
            gr.Markdown("""These buttons give no feedback, but if you clicked them - it worked!""")

    with gr.Row():
        with gr.Column():
            gr.Markdown("""### Leaderboard:""")
            leaderboard = gr.HTML(elem_id="leaderboard")
            gr.Markdown("""### Context:""")
            gr.Markdown("""The model found these answers the most relevant (sorted by relevance). It used the most relevant ones to answer the question.""")
            context = gr.HTML()

    callback.setup([message, display_answer, who, correction], "flagged_data_points")

    # question_bot returns three values; the original wiring listed only two
    # outputs, which Gradio rejects at runtime — leaderboard added.
    submit.click(question_bot, inputs=[message],
                 outputs=[display_answer, context, leaderboard],
                 api_name='apicall')

    def _flag_and_refresh(*args):
        # Log the flagged row, then return ONLY the refreshed leaderboard —
        # the original lambda returned a two-element list for one output.
        callback.flag(args)
        return only_leaderboard()

    ok.click(_flag_and_refresh,
             [message, display_answer, who, correction, ok],
             outputs=[leaderboard], api_name='flagging_2', preprocess=False)

chat.launch()