File size: 4,432 Bytes
070c576
 
 
 
 
 
6449689
92d0a3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6449689
 
 
 
 
 
 
 
 
 
 
 
 
 
803a58c
6449689
 
 
 
 
f132467
6449689
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
070c576
6449689
 
070c576
 
 
 
 
 
6449689
070c576
6449689
070c576
6449689
 
 
 
 
070c576
6449689
 
070c576
 
 
 
7156cc6
6449689
070c576
6449689
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import gradio as gr
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
import os
import pandas as pd
import numpy as np
from groq import Groq
import requests
from bs4 import BeautifulSoup




def extract_statuses(url):
    """Download the first Excel file linked from ``url`` and list its TDoc statuses.

    The page at ``url`` is fetched and its ``<a>`` tags are scanned in document
    order. The first link whose ``href`` ends in ``.xls`` or ``.xlsx``
    (case-insensitive) is downloaded to ``guide_status.xlsx`` in the current
    working directory and loaded with pandas.

    Args:
        url: Address of the web page that links to the Excel file.

    Returns:
        The unique values of the ``'TDoc Status'`` column with ``'withdrawn'``
        removed. An empty list is returned when the workbook has no such column
        or when the page links to no Excel file at all.
    """
    # Function-scope import so the block does not rely on a new file-level import.
    from urllib.parse import urljoin

    # A timeout keeps a stalled server from hanging the caller indefinitely.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    for link in soup.find_all('a'):
        href = link.get('href')
        if not href or not href.lower().endswith(('.xls', '.xlsx')):
            continue

        # urljoin resolves absolute, root-relative and page-relative hrefs the
        # way a browser would; plain concatenation only works for one of them.
        excel_url = urljoin(url, href)
        excel_response = requests.get(excel_url, timeout=30)
        file_name = 'guide_status.xlsx'

        with open(file_name, 'wb') as f:
            f.write(excel_response.content)

        df = pd.read_excel(file_name)

        if 'TDoc Status' not in df.columns:
            print(f"'TDoc Status' column not found in {file_name}")
            return []

        unique_statuses = df['TDoc Status'].unique().tolist()
        print(f'Downloaded {file_name} and extracted statuses: {unique_statuses}')
        return [status for status in unique_statuses if status != 'withdrawn']

    # No Excel link on the page: return the same "nothing found" value as the
    # missing-column case instead of falling off the end with None.
    return []



def ask_llm(query, input, client_index):
    """Send one system+user exchange to an LLM backend and return its reply text.

    Args:
        query: Instruction embedded in the system prompt as the "User Query".
        input: Content sent as the user message.
        client_index: ``0`` selects Groq (``mixtral-8x7b-32768``); any other
            value selects Mistral (``mistral-small-latest``).

    Returns:
        The message content of the first choice in the completion.

    Raises:
        KeyError: If the API-key environment variable for the selected backend
            (``GROQ_API_KEY`` or ``MISTRAL_API_KEY``) is not set.
    """
    system_message = {
        "role": "system",
        "content": f"You are a helpful assistant. Only show your final response to the **User Query**! Do not provide any explanations or details: \n# User Query:\n{query}.",
    }
    user_message = {
        "role": "user",
        "content": f"{input}",
    }
    conversation = [system_message, user_message]

    use_groq = client_index == 0
    if use_groq:
        groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
        completion = groq_client.chat.completions.create(
            messages=conversation,
            model='mixtral-8x7b-32768',
        )
    else:
        mistral_client = MistralClient(api_key=os.environ['MISTRAL_API_KEY'])
        completion = mistral_client.chat(
            messages=conversation,
            model='mistral-small-latest',
        )

    first_choice = completion.choices[0]
    return first_choice.message.content

def filter_df(df, column_name, keywords):
    """Keep the rows of ``df`` whose text contains any of ``keywords``.

    Matching is a case-insensitive substring test. Only string cells are
    searched; any other cell (numbers, NaN, None) is treated as empty text.

    Args:
        df: DataFrame to filter.
        column_name: Column to search. When it is not one of ``df``'s columns,
            every column of each row is searched instead.
        keywords: Iterable of keyword strings. ``None`` or an empty collection
            disables filtering.

    Returns:
        ``df`` itself when there are no keywords, otherwise a new DataFrame
        holding the matching rows with their original index labels.
    """
    if keywords is None or len(keywords) == 0:
        return df

    # Lower-case the keywords once instead of once per inspected cell.
    needles = [keyword.lower() for keyword in keywords]

    def cell_matches(cell):
        haystack = cell.lower() if isinstance(cell, str) else ''
        return any(needle in haystack for needle in needles)

    if column_name in df.columns:
        mask = [cell_matches(cell) for cell in df[column_name]]
    else:
        mask = [
            any(cell_matches(cell) for cell in row)
            for row in df.itertuples(index=False, name=None)
        ]

    # An explicit boolean array is always read as a row mask, even when the
    # frame is empty (an empty non-boolean mask would be read as a column list).
    return df[np.asarray(mask, dtype=bool)]

def chat_with_mistral(source_cols, dest_col, prompt, excel_file, url, search_col, keywords, client):
    """Fill ``dest_col`` with LLM answers for the rows selected by the keyword filter.

    The workbook is loaded, ``dest_col`` is reset to an empty string on every
    row, and ``filter_df`` picks the rows to process. For each picked row the
    ``source_cols`` values are joined into one text and sent to ``ask_llm``
    together with ``prompt``; the answer is stored in ``dest_col``. Rows that
    were filtered out keep the empty string. The full frame is then saved.

    Args:
        source_cols: Column names whose values form the LLM input.
        dest_col: Column that receives the answers (created or overwritten).
        prompt: Instruction forwarded to ``ask_llm`` as the query.
        excel_file: Workbook to read; also the output target when no file
            name can be derived from ``url``.
        url: The second-to-last ``/``-separated piece of this value, plus
            ``.xlsx``, becomes the output file name.
        search_col: Column passed to ``filter_df`` for keyword matching.
        keywords: Keywords passed to ``filter_df``.
        client: Backend selector forwarded to ``ask_llm``.

    Returns:
        A tuple ``(file_name, preview)`` where ``preview`` is the first five
        rows of the updated frame.
    """
    print(f'xlsxfile = {excel_file}')
    df = pd.read_excel(excel_file)
    df[dest_col] = ""

    # Catch only the failures a missing or slash-less URL can produce; a bare
    # except would also hide KeyboardInterrupt and unrelated bugs.
    try:
        file_name = url.split("/")[-2] + ".xlsx"
    except (AttributeError, IndexError, TypeError):
        # Fallback: the result is written over the input workbook.
        file_name = excel_file
    print(f"Keywords: {keywords}")

    filtered_df = filter_df(df, search_col, keywords)

    for index, row in filtered_df.iterrows():
        concatenated_content = "\n\n".join(f"{column_name}: {str(row[column_name])}" for column_name in source_cols)
        llm_answer = ask_llm(prompt, concatenated_content, client)
        print(f"QUERY:\n{prompt}\nCONTENT:\n{concatenated_content[:200]}...\n\nANSWER:\n{llm_answer}")
        # Write into the full frame, not the filtered copy, so the saved file
        # contains every row.
        df.at[index, dest_col] = llm_answer

    df.to_excel(file_name, index=False)
    return file_name, df.head(5)

def get_columns(file):
    """Build dropdown updates and a preview from the columns of an Excel file.

    Args:
        file: Excel file to inspect, or ``None`` when nothing is selected.

    Returns:
        A 6-tuple: five ``gr.update`` objects followed by a DataFrame.
        With a file, the first three updates offer the column names, the
        fourth adds an empty-string choice, the fifth adds ``'[ALL]'``, and
        the DataFrame is the first five rows. Without a file, every update
        has an empty choice list and the DataFrame is empty.
    """
    if file is None:
        cleared = [gr.update(choices=[]) for _ in range(5)]
        return (*cleared, pd.DataFrame())

    frame = pd.read_excel(file)
    names = list(frame.columns)
    plain_updates = [gr.update(choices=names) for _ in range(3)]
    with_blank = gr.update(choices=names + [""])
    with_all = gr.update(choices=names + ['[ALL]'])
    preview = frame.head(5)
    return (*plain_updates, with_blank, with_all, preview)