import gradio as gr
import pandas as pd

from uniflow.flow.client import TransformClient
from uniflow.flow.config import TransformOpenAIConfig
from uniflow.op.prompt import Context

# When True, print each combined LLM response for inspection.
DEBUG = False


def generate_relevant_chunks(query, input_data, progress=gr.Progress()):
    """Find the chunks in each paper most relevant to ``query`` via an LLM.

    Args:
        query: Short description of the information being sought.
        input_data: List of dicts, each with a ``"paper"`` key (paper title)
            and a ``"chunks"`` key (list of paragraph strings).
            NOTE(review): schema inferred from usage below — confirm with caller.
        progress: Gradio progress tracker; currently unused but kept so the
            function remains a drop-in gradio event handler.

    Returns:
        ``[output_answers, df]`` where ``output_answers`` is a list of
        ``{"paper": <title>, "answer": [<sentences>]}`` dicts (``["None"]``
        when nothing relevant was found) and ``df`` is the same data as a
        ``pandas.DataFrame``.
    """
    # One Context per (query, paragraph) pair, grouped per paper.
    data_list = [
        {
            "paper": paper["paper"],
            "data": [Context(context=query, paragraph=p) for p in paper["chunks"]],
        }
        for paper in input_data
    ]

    instruction = """
    # Task: I am a researcher trying to understand information across several research papers. You are to determine which of the chunks most directly contains information related to the query.
    ## Input:
    1. context: A brief query or description of the information I am looking for.
    2. paragraph: An paragraph from a research paper.
    ## Evaluation Criteria: You should pick which sentence(s) contains directly relevant information to the context. The best answer is the sentences that most directly answer or contain the information specific to the context. If there are no such sentences, you should answer with ["None"].
    ## Response Format: Your response should only include two fields below:
    1. explanation: Reasoning behind your judgment, explaining why the answer is appropriate or not.
    2. answer: The best sentence(s) that meet the Evaluation Criteria as a list of strings. This should be ["None"] if no sentence answers the query. At most, include 3 sentences.
    """

    few_shot_examples = []
    # Threads and batch size are kept equal so one batch saturates the pool.
    num_thread_batch_size = 16

    config = TransformOpenAIConfig()
    config.prompt_template.instruction = instruction
    config.prompt_template.few_shot_prompt = few_shot_examples
    config.model_config.model_name = "gpt-4-1106-preview"
    config.model_config.response_format = {"type": "json_object"}
    config.model_config.num_call = 1
    # Deterministic output: a single call at temperature 0.
    config.model_config.temperature = 0.0
    config.model_config.num_thread = num_thread_batch_size
    config.model_config.batch_size = num_thread_batch_size
    client = TransformClient(config)

    output = []
    for paper in data_list:
        if not paper["data"]:
            # A paper with no chunks would make client.run() return nothing and
            # init_output[0] raise IndexError; synthesize an empty response so
            # the positional alignment with input_data is preserved.
            output.append(
                {"output": [{"response": [{"explanation": [], "answer": ["None"]}]}]}
            )
            continue

        init_output = client.run(paper["data"])
        # Fold every per-chunk response into the first one: answers are
        # concatenated, explanations collected into a list.
        combined_output = init_output[0]
        first_response = combined_output["output"][0]["response"][0]
        first_response["explanation"] = [first_response["explanation"]]
        if DEBUG:
            print(combined_output)
        for item in init_output[1:]:
            response = item["output"][0]["response"][0]
            first_response["answer"].extend(response["answer"])
            first_response["explanation"].append(response["explanation"])
        output.append(combined_output)

    output_answers = []
    for idx, o in enumerate(output):
        # Drop the "None" sentinels; if nothing real remains, report ["None"].
        filtered_answers = [
            item for item in o["output"][0]["response"][0]["answer"] if item != "None"
        ]
        if not filtered_answers:
            filtered_answers = ["None"]
        output_answers.append(
            {"paper": input_data[idx]["paper"], "answer": filtered_answers}
        )

    df = pd.DataFrame(output_answers)
    return [output_answers, df]


def generate_answers(papers, queries, progress=gr.Progress()):
    """Run :func:`generate_relevant_chunks` for each query over all papers.

    Args:
        papers: List of paper dicts accepted by ``generate_relevant_chunks``.
        queries: List of query strings.
        progress: Gradio progress tracker used to report per-query progress.

    Returns:
        Tuple ``(output_data, df)``: ``output_data`` is a flat list of
        ``{"paper", "answer", "query"}`` dicts; ``df`` is the pivoted
        queries-by-papers DataFrame from :func:`create_df`.
    """
    if DEBUG:
        print(len(papers), len(queries))
    output_data = []
    for query in progress.tqdm(queries, desc="Queries"):
        data, _ = generate_relevant_chunks(query, papers)
        # Tag each per-paper result with the query that produced it.
        for d in data:
            d["query"] = query
        output_data.extend(data)
    df = create_df(output_data)
    return output_data, df


def create_df(data):
    """Pivot flat results into a DataFrame of queries (rows) x papers (columns).

    Args:
        data: List of dicts with ``"query"``, ``"paper"`` and ``"answer"``
            (list of strings) keys.

    Returns:
        DataFrame whose first column is ``"Queries"`` and whose remaining
        columns are paper titles; each cell holds the first answer sentence
        for that (query, paper) pair, or ``None`` when there was no answer.
    """
    query_data = {item["query"]: {} for item in data}
    for item in data:
        # Only the first answer sentence is shown in the grid.
        answer = item["answer"][0] if item["answer"] else None
        query_data[item["query"]][item["paper"]] = answer

    df = pd.DataFrame.from_dict(query_data, orient="index")
    # reset_index() inserts the former index at position 0, so "Queries" is
    # already the first column — no manual reordering needed.
    df = df.rename_axis("Queries").reset_index()
    return df