File size: 2,930 Bytes
734db66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444fbc0
734db66
 
 
 
 
 
 
 
 
 
 
 
444fbc0
33a6d1c
734db66
 
 
89ff17f
 
734db66
444fbc0
 
734db66
33a6d1c
444fbc0
00b67ab
 
734db66
 
 
 
6da8cb0
734db66
4460317
734db66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444fbc0
734db66
5f90b6d
734db66
 
 
 
 
 
 
 
 
 
 
f6d3844
734db66
444fbc0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import gradio as gr
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain
from langchain import PromptTemplate
import re
import pandas as pd
from langchain.vectorstores import FAISS
import requests
from typing import List
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI

from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any

import ast
from utils import ClaudeLLM

# Embedding model handed to the FAISS loader below. No arguments are passed,
# so whatever defaults HuggingFaceEmbeddings ships with are used.
embeddings = HuggingFaceEmbeddings()
# Vector store loaded at import time from the local 'db_full' path
# (presumably a directory holding a previously saved index — confirm).
# retrieve_thoughts() reads this global directly.
db = FAISS.load_local('db_full', embeddings)
# Module-level cache: qa_retrieve() stores its latest truthy retrieval result
# here and falls back to it when a later retrieval returns something falsy.
mp_docs = {}

def retrieve_thoughts(query, n):
    """Rank the documents stored in the global FAISS index ``db`` for ``query``.

    Every entry of the index is scored against ``query`` (``k`` and
    ``fetch_k`` are both set to the index size). Chunks whose score is
    below 1 are kept and grouped by their ``_id`` metadata value, giving
    one row per source document.

    The chunk metadata is expected to provide the ``_id``, ``id``,
    ``title`` and ``url`` fields, since those columns are read below.
    NOTE(review): scores are sorted ascending and thresholded with ``< 1``,
    so they are presumably distances where lower means closer — confirm
    against how the index was built.

    Args:
        query: Text to search for.
        n: Maximum number of documents to return. A falsy value (``0`` or
            ``None``) disables the limit.

    Returns:
        A dict with the single key ``'tier 1'`` holding a DataFrame with the
        columns ``_id``, ``title``, ``url``, ``score`` (mean score of the
        document's kept chunks), ``ref`` (1-based number assigned in ``_id``
        group order, before the final sort) and ``chunks`` (a dict
        ``{"chunk_<i>": {"id", "score", "page_content"}}`` ordered by chunk
        ``id``). Rows are sorted by ascending ``score``. When nothing is
        retrieved, or no chunk scores below 1, the DataFrame is empty but
        still carries these columns.
    """
    result_columns = ['_id', 'title', 'url', 'score', 'ref', 'chunks']

    # Score the whole index so the threshold below, not k, decides what is kept.
    index_size = len(db.index_to_docstore_id)
    docs_with_score = db.similarity_search_with_score(
        query=query, k=index_size, fetch_k=index_size
    )
    if not docs_with_score:
        # Nothing came back: there is no '_id' column to work with, so
        # return an empty, correctly shaped result instead of raising.
        return {'tier 1': pd.DataFrame(columns=result_columns)}

    df = pd.DataFrame([doc.metadata for doc, _ in docs_with_score])
    df['page_content'] = [doc.page_content for doc, _ in docs_with_score]
    df['score'] = [score for _, score in docs_with_score]
    df['_id'] = df['_id'].apply(str)
    df = df.sort_values('score')

    tier_1 = df[df['score'] < 1]
    if tier_1.empty:
        # The query matched none of the stored documents closely enough.
        # Grouping an empty frame does not yield a usable per-document
        # 'chunks' column, so short-circuit with an empty result.
        return {'tier 1': pd.DataFrame(columns=result_columns)}

    # All per-document values below come from the same grouping, so they share
    # the same (sorted by _id) order and can be assigned positionally.
    grouped = tier_1.groupby(['_id'])

    chunks_1 = grouped.apply(
        lambda group: {
            f"chunk_{i}": row
            for i, row in enumerate(
                group.sort_values('id')[['id', 'score', 'page_content']].to_dict('records')
            )
        }
    ).values

    tier_1_adjusted = grouped.first().reset_index()[['_id', 'title', 'url', 'score']]
    tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1)
    tier_1_adjusted['chunks'] = chunks_1
    # Replace the first-chunk score with the mean over the document's chunks.
    tier_1_adjusted['score'] = grouped.apply(lambda group: group['score'].mean()).values
    tier_1_adjusted = tier_1_adjusted.sort_values('score')

    if n:
        tier_1_adjusted = tier_1_adjusted.iloc[:n]

    return {'tier 1': tier_1_adjusted}

def qa_retrieve(query):
    """Return the reference documents retrieved for ``query``.

    Calls ``retrieve_thoughts(query, 0)``. A truthy result is stored in the
    module-level ``mp_docs`` cache; a falsy result is replaced by the cached
    value when one exists.

    Args:
        query: The user's question text.

    Returns:
        ``{'Reference': records}`` where ``records`` is a list of dicts with
        the keys ``_id``, ``url``, ``title``, ``chunks`` and ``score`` taken
        from the ``'tier 1'`` DataFrame.
    """
    global mp_docs

    # Debug output of the loaded vector store on every call.
    print(db)

    thoughts = retrieve_thoughts(query, 0)
    if thoughts:
        # Remember the latest usable result for later fallback.
        mp_docs = thoughts
    elif mp_docs:
        thoughts = mp_docs

    wanted_columns = ['_id', 'url', 'title', 'chunks', 'score']
    records = thoughts['tier 1'][wanted_columns].to_dict('records')
    return {'Reference': records}

def flush():
    """Do nothing and return ``None``."""
    return

# Sample inputs passed to the interface; each inner list is one example row.
examples = [["Will Russia win the war in Ukraine?"]]

# Interface that feeds a five-line text box into qa_retrieve and uses the
# "json" output component for whatever it returns.
# NOTE(review): `gr.inputs.Textbox` is the legacy input namespace; newer
# Gradio releases expose `gr.Textbox` instead — confirm against the pinned
# Gradio version before upgrading.
demo = gr.Interface(
    fn=qa_retrieve,
    inputs=gr.inputs.Textbox(lines=5, label="what would you like to learn about?"),
    outputs="json",
    title="cicero-qa-api",
    examples=examples,
)

# Start the app as soon as this module is executed.
demo.launch()