menarini

Sleeping

File size: 7,528 Bytes

2fb0169
 
152dfa1
2fb0169
 
 
36aeeec
 
152dfa1
2fb0169
 
08152a0
 
6fb6db8
 
 
08152a0
 
 
 
 
 
6fb6db8
08152a0
 
 
 
 
 
 
 
 
6fb6db8
 
 
08152a0
 
 
 
152dfa1
9a8ebcf
2fb0169
 
9a8ebcf
6fb6db8
 
9a8ebcf
152dfa1
3351f20
08152a0
9a8ebcf
 
2fb0169
9a8ebcf
2fb0169
9a8ebcf
4c78198
 
 
 
 
 
 
 
 
 
 
 
 
6fb6db8
 
6db8557
7cc108b
9d2951a
6fb6db8
4c78198
2fb0169
 
6fb6db8
 
 
 
 
 
4c78198
 
 
 
 
 
 
 
 
 
 
 
 
6fb6db8
 
 
 
 
 
6db8557
2fb0169
9a8ebcf
6fb6db8
2fb0169
 
71a34b2
9a8ebcf
6fb6db8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fb0169
 
 
 
9a8ebcf
 
71a34b2
2fb0169
 
9a8ebcf

import pandas as pd
import requests
from pydantic import Field, BaseModel

from omegaconf import OmegaConf

from vectara_agentic.agent import Agent
from vectara_agentic.tools import ToolsFactory, VectaraToolFactory

# Opening message for a new conversation. It is not referenced anywhere else in
# this file; presumably the UI layer imports and displays it -- TODO confirm.
initial_prompt = "How can I help you today?"

# Custom summarization prompt handed to the `ask_publications` RAG tool through
# its `vectara_prompt_text` argument (see create_assistant_tools).
# The text is a JSON-style list of chat messages (system + user). The
# `$vectara...` names and the `#foreach` / `#if` / `#else` / `#end` directives
# are template placeholders -- presumably Velocity syntax expanded on the
# Vectara side at query time; verify against Vectara's custom-prompt docs.
# NOTE(review): the string is sent verbatim, so whitespace and the `***`
# markers inside it are part of the runtime behavior; edit with care.
prompt = """
[
  {"role": "system", "content": "You are an AI assistant that forms a coherent answer to a user query based on search results that are provided to you." },
  {"role": "user", "content": " 
    [INSTRUCTIONS]
    If the search results are irrelevant to the question respond with *** I do not have enough information to answer this question.***
    Search results may include tables in a markdown format. When answering a question using a table be careful about which rows and columns contain the answer and include all relevant information from the relevant rows and columns that the query is asking about.
    Do not base your response on information or knowledge that is not in the search results.
    Make sure your response is answering the query asked. If the query is related to an entity (such as a person or place), make sure you use search results related to that entity.
    Consider that each search result is a partial segment from a bigger text, and may be incomplete.
    Your output should always be in a single language - the $vectaraLangName language. Check spelling and grammar for the $vectaraLangName language.
    Search results for the query *** $vectaraQuery***, are listed below, some are text, some MAY be tables in markdown format.
    #foreach ($qResult in $vectaraQueryResultsDeduped)
      [$esc.java($foreach.index + 1)]
      #if($qResult.hasTable())
        Table Title: $qResult.getTable().title() || Table Description: $qResult.getTable().description() || Table Data:
        $qResult.getTable().markdown()
      #else
        $qResult.getText()
      #end
    #end
    Generate a coherent response (but no more than $vectaraOutChars characters) to the query *** $vectaraQuery *** using information and facts in the search results provided. 
    Give a slight preference to search results that appear earlier in the list.
    Include statistical and numerical evidence to support and contextualize your response.
    Only cite relevant search results in your answer following these specific instructions: $vectaraCitationInstructions
    If the search results are irrelevant to the query, respond with ***I do not have enough information to answer this question.***. Respond always in the $vectaraLangName language, and only in that language."}
]
"""

def create_assistant_tools(cfg):
    """Build the list of tools made available to the publications agent.

    Two Vectara-backed tools are created from ``cfg.api_key`` and
    ``cfg.corpus_key``:

    * ``ask_publications`` -- a RAG tool that uses the module-level ``prompt``
      as its summarization prompt text.
    * ``search_publications`` -- a search tool over the same corpus.

    Args:
        cfg: Configuration object exposing ``api_key`` and ``corpus_key``
            attributes.

    Returns:
        The ``ToolsFactory().standard_tools()`` list followed by
        ``[ask_publications, search_publications]``.
    """

    class QueryPublicationsArgs(BaseModel):
        # Single-argument schema shared by both tools.
        query: str = Field(
            ...,
            description="The user query, always in the form of a question",
            examples=[
                "what are the risks reported?",
                "which drug was use on the and how big was the population?",
            ],
        )

    def retrieval_options():
        # Reranking/context settings common to both tools. A fresh dict (and a
        # fresh chain list) is built per call so the tools share no mutable
        # configuration objects.
        return {
            "reranker": "chain",
            "rerank_k": 100,
            "rerank_chain": [
                # First stage: no score cutoff is configured.
                {"type": "multilingual_reranker_v1"},
                # Second stage: MMR with a diversity bias and a result limit.
                {"type": "mmr", "diversity_bias": 0.2, "limit": 50},
            ],
            "n_sentences_before": 2,
            "n_sentences_after": 2,
            "lambda_val": 0.005,
        }

    # Tool descriptions keep their original leading newline and indentation,
    # since the text is passed through verbatim.
    ask_description = """
        Responds to an user question about a particular result, based on the publications.
        """
    search_description = """
        Returns matching publications to a user query.
        """
    summarizer_name = 'vectara-summary-table-md-query-ext-jan-2025-gpt-4o'

    vec_factory = VectaraToolFactory(
        vectara_api_key=cfg.api_key,
        vectara_corpus_key=cfg.corpus_key,
    )

    ask_publications = vec_factory.create_rag_tool(
        tool_name="ask_publications",
        tool_description=ask_description,
        tool_args_schema=QueryPublicationsArgs,
        **retrieval_options(),
        summary_num_results=15,
        vectara_summarizer=summarizer_name,
        include_citations=True,
        vectara_prompt_text=prompt,
        save_history=True,
        verbose=False,
    )

    search_publications = vec_factory.create_search_tool(
        tool_name="search_publications",
        tool_description=search_description,
        tool_args_schema=QueryPublicationsArgs,
        **retrieval_options(),
        save_history=True,
        verbose=True,
    )

    # Generic tools first, then the two corpus-specific ones.
    standard = ToolsFactory().standard_tools()
    return standard + [ask_publications, search_publications]

def initialize_agent(_cfg, agent_progress_callback=None):
    """Construct the drug-trial publications agent and return it.

    The agent is built with the tools from ``create_assistant_tools(_cfg)``,
    the topic "Drug trials publications", and the custom instructions defined
    below. ``agent.report()`` is called once before the agent is returned.

    Args:
        _cfg: Configuration object forwarded to ``create_assistant_tools``.
        agent_progress_callback: Optional callback forwarded unchanged to
            ``Agent``; defaults to ``None``.

    Returns:
        The constructed ``Agent`` instance.
    """
    # Instruction text is passed to the agent verbatim; do not reformat it.
    instructions = """
    - You are an expert statistician and clinical trial data analyst with extensive experience in designing, analyzing, and interpreting clinical research data.
    - Your responses should be technically rigorous, data-driven, and written for an audience familiar with advanced statistical methodologies, regulatory standards, and the nuances of clinical trial design. 
    - Call the ask_publications tool to retreive information to answer the user query.
      If the initial query lacks comprehensive data, continue to query ask_publications with refined search parameters until you retrieve all necessary numerical details
    - Call the search_publications tool to retreive a list of publications that may contain the information needed to answer the user query.
      The results include the document_id of each publication, and metadata.
    - When responding to queries:
    1) Use precise statistical terminology (e.g., randomization, blinding, intention-to-treat, type I/II error, p-values, confidence intervals, Bayesian methods, etc.) 
       and reference common methodologies or guidelines where applicable (e.g., CONSORT, FDA, EMA).
    2) Your responses must include contextual information such as sample size and population characteristics. This nuance is crucial in clinical trial analysis.
       When considering or reporting sample sizes, consider participants who were eligible for the study, those who were randomized, and those who completed the study.
       If it's unclear which one is being referred to, clarify this in your response or ask the user for clarification.
    3) Provide clear explanations of statistical concepts, including assumptions, potential biases, and limitations in the context of clinical trial data.
    4) Ensure that your analysis is evidence-based and reflects current best practices in the field of clinical research and data analysis.
    5) Before finalizing your answer, review the analysis to ensure that all relevant data has been incorporated and that your conclusions are well-supported by the evidence.
    6) Provide sources and citations for all data and statistical information included in your responses, as provided in the response from the tools.
    """

    assistant_tools = create_assistant_tools(_cfg)
    publications_agent = Agent(
        tools=assistant_tools,
        topic="Drug trials publications",
        custom_instructions=instructions,
        agent_progress_callback=agent_progress_callback,
    )
    publications_agent.report()
    return publications_agent