Spaces:

chasetank
/

manual_assistant

Paused

App Files Files Community

chasetank committed on Sep 12, 2023

Commit

4cf7340

•

1 Parent(s): 8bef1e4

Upload 2 files

Browse files

Files changed (2) hide show

InnovationHub/llm/chain.py +127 -0
InnovationHub/llm/vector_store.py +179 -0

InnovationHub/llm/chain.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import gradio
+from langchain.embeddings import HuggingFaceBgeEmbeddings
+from langchain.vectorstores import FAISS
+from langchain import OpenAI, ConversationChain, LLMChain, PromptTemplate
+from langchain.chains.conversation.memory import ConversationBufferMemory
def chat(question, vehicle, k=10, temperature=0.01):
    """Answer ``question`` from the owner's manual selected by ``vehicle``.

    A new LLM chain is created on every call and passed, together with the
    module-level ``model_norm`` embeddings, to ``ask_question``.

    Args:
        question: The user's question text.
        vehicle: Manual name; downstream it is used as a key into ``db_paths``.
        k: Number of similar manual excerpts to retrieve.
        temperature: Sampling temperature forwarded to the LLM.

    Returns:
        The value returned by ``ask_question``.
    """
    llm_chain = create_chatgpt_chain(temperature=temperature)
    return ask_question(
        question=question,
        vehicle=vehicle,
        k=k,
        embeddings=model_norm,
        chatgpt_chain=llm_chain,
    )
def create_chatgpt_chain(temperature):
    """Build an ``LLMChain`` with a chat-history prompt and buffer memory.

    Args:
        temperature: Sampling temperature passed to the ``OpenAI`` LLM.

    Returns:
        A verbose ``LLMChain`` whose prompt takes ``chat_history`` and
        ``question`` and whose memory is stored under ``chat_history``.
    """
    conversation_template = """
    {chat_history}
    Human: {question}
    AI:
    """
    prompt = PromptTemplate(
        template=conversation_template,
        input_variables=["chat_history", "question"],
    )
    # NOTE(review): "gpt-3.5-turbo" is used through the completion-style
    # ``OpenAI`` wrapper here; confirm that this is intended.
    llm = OpenAI(temperature=temperature, model_name="gpt-3.5-turbo")
    history = ConversationBufferMemory(memory_key="chat_history")
    return LLMChain(llm=llm, prompt=prompt, verbose=True, memory=history)
def ask_question(question, vehicle, k, embeddings, chatgpt_chain):
    """Build the manual-grounded prompt for ``question`` and run the chain.

    Args:
        question: The user's question text.
        vehicle: Manual name, forwarded to ``get_prompt``.
        k: Number of manual excerpts to include in the prompt.
        embeddings: Kept for backward compatibility and not used here.
            ``get_prompt`` loads the FAISS index itself, so loading it a
            second time in this function only repeated the disk read and
            discarded the result.
        chatgpt_chain: Object with a ``run(question=...)`` method.

    Returns:
        The value returned by ``chatgpt_chain.run``.
    """
    prompt = get_prompt(question=question, vehicle=vehicle, k=k)
    return chatgpt_chain.run(question=prompt)
def get_prompt(question, vehicle, k):
    """Compose the LLM prompt: instructions, manual excerpts, then the question.

    The FAISS index stored under ``db_paths[vehicle]`` is loaded with the
    module-level ``model_norm`` embeddings and searched for the ``k``
    documents most similar to ``question``.

    Args:
        question: The user's question text.
        vehicle: Key into ``db_paths``; also interpolated into the instructions.
        k: Number of similar documents to request and include.

    Returns:
        The full prompt string.
    """
    instructions = f"""
    I need information from my {vehicle} manual.
    I will provide an excerpt from the manual. Use the excerpt and nothing else to answer the question.
    You must refer to the excerpt as "{vehicle} Manual" in your response. Here is the excerpt:
    """
    store = FAISS.load_local(folder_path=db_paths[vehicle], embeddings=model_norm)
    matches = store.similarity_search(query=question, k=k)
    excerpts = [match.page_content for match in matches][:k]
    return "".join(
        (instructions, '\n[EXCERPT]', '\n'.join(excerpts), '\nQuestion: ', question)
    )
# Maps each selectable manual name to the folder holding its FAISS index.
db_paths = {
    "2023 AMG C-Coupe-Cab": "data/amg_c_coupe_cab",
    "2023 AMG C-Sedan": "data/amg_c_sedan",
    "2023 AMG E-Coupe-Cab": "data/amg_e_coupe_cab",
    "2023 AMG E-Sedan_wagon": "data/amg_e_sedan_wagon",
    "2023 AMG_EQE-Sedan": "data/amg_eqe_sedan",
    "2023 AMG_GLE-suv": "data/amg_gle_suv",
    "2023 AMG_GLS SUV": "data/amg_gls_suv",
    "2023 C-Cab": "data/c_cab",
    "2023 C-Coupe": "data/c_coupe",
    "2023 C-Sedan": "data/c_sedan",
    "2023 CLA": "data/cla",
    "2023 E-Cab": "data/e_cab",
    "2023 E-Coupe": "data/e_coupe",
    "2023 E-Sedan": "data/e_sedan",
    "2023 E-wagon": "data/e_wagon",
    "2023 eqb SUV": "data/eqb_suv",
    "2023 EQE-Sedan": "data/eqe_sedan",
    "2023 EQS_Sedan": "data/eqs_sedan",
    "2023 EQS SUV": "data/eqs_suv",
    "2023 GLA": "data/gla",
    "2023 GLB": "data/glb",
    "2023 GLC-Coupe": "data/glc_coupe",
    "2023 GLE-Coupe": "data/gle_coupe",
    "2023 GLE-suv": "data/gle_suv",
    "2023 GLS SUV": "data/gls_suv",
}

# Dropdown choices, derived from db_paths so the two can never drift apart.
# A case-insensitive sort reproduces the order of the former hand-written list.
vehicle_options = sorted(db_paths, key=str.casefold)
# Embedding model shared by the FAISS lookups in this module.
model_name = "BAAI/bge-large-en"
model_kwargs = dict(device="cpu")
# set True to compute cosine similarity
encode_kwargs = dict(normalize_embeddings=True)
model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)
def start_ui():
    """Build and launch the Gradio question-answering interface.

    The interface exposes two inputs, the question text and the manual
    selected from ``vehicle_options``, and calls ``chat`` with them; ``k``
    and ``temperature`` keep the defaults declared by ``chat``.

    Every example row has exactly one value per input, and every example
    vehicle is an entry of ``vehicle_options``, so that selecting an example
    cannot pass ``chat`` a manual name missing from ``db_paths``.
    """
    chatbot_interface = gradio.Interface(
        fn=chat,
        inputs=[
            "text",
            gradio.Dropdown(
                choices=vehicle_options,
                label="Select Mercedes-Benz Owner's Manual",
            ),
        ],
        outputs="text",
        title="Mercedes-Benz Owner's Manual",
        description="Ask a question and get answers from Mercedes-Benz Owner's Manual.<u>Disclaimer:</u> THIS IS NOT OFFICIAL AND MAY NOT BE AVAILABLE ALL THE TIME. ALWAYS LOOK AT THE OFFICIAL DOCUMENTATION at https://www.mbusa.com/en/owners/manuals",
        # NOTE(review): the former examples named "2023 S-Class", "2023 EQS"
        # and "2023 EQE", none of which has an index in db_paths; the closest
        # available manuals are used instead -- confirm the substitutions.
        examples=[
            ["What are the different features of the dashboard console?", "2023 E-Sedan"],
            ["What is flacon? Which page has that information? Show me all the exact content from that page", "2023 E-Sedan"],
            ["What is hyperscreen?", "2023 EQS_Sedan"],
            ["Where can I find my vin?", "2023 EQS_Sedan"],
            ["Does it take more than 30 minutes to charge? Which page has that information? Show me all the exact content from that page", "2023 EQE-Sedan"],
        ],
        article='<center><img src="https://visitor-badge.glitch.me/badge?page_id=kaushikdatta.owner-manual" alt="visitor badge"/></center>',
    )
    chatbot_interface.launch()

InnovationHub/llm/vector_store.py ADDED Viewed

	@@ -0,0 +1,179 @@

+import plotly.graph_objs as go
+from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
+import plotly.express as px
+import numpy as np
+import os
+import pprint
+import codecs
+import chardet
+import gradio as gr
+from langchain.llms import HuggingFacePipeline
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+from langchain import OpenAI, ConversationChain, LLMChain, PromptTemplate
+from langchain.memory import ConversationBufferWindowMemory
def get_content(input_file):
    """Return the text of ``input_file``, decoded with its detected encoding.

    The file is read once in binary mode; ``chardet`` guesses the encoding
    from those bytes and the same bytes are then decoded, instead of opening
    and reading the file a second time.

    Args:
        input_file: Path of the file to read.

    Returns:
        The decoded file content as a string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the bytes are not valid in the chosen encoding.
    """
    with open(input_file, 'rb') as f:
        raw_data = f.read()
    # chardet may report no encoding at all (presumably for empty or
    # undecidable input); fall back to UTF-8 rather than passing None on.
    encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
    return raw_data.decode(encoding)
def split_text(input_file, chunk_size=1000, chunk_overlap=0):
    """Split a file's text into chunks and build matching documents.

    Args:
        input_file: Path of the file to read via ``get_content``.
        chunk_size: ``chunk_size`` given to the text splitter.
        chunk_overlap: ``chunk_overlap`` given to the text splitter.

    Returns:
        A ``(texts, metadatas, docs)`` tuple: the chunk strings, one
        ``{"source": "<file stem>[<chunk position>]"}`` dict per chunk, and
        the documents the splitter creates from both.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    stem, _extension = os.path.splitext(os.path.basename(input_file))
    texts = splitter.split_text(text=get_content(input_file=input_file))
    metadatas = [
        {"source": f"{stem}[{position}]"} for position, _chunk in enumerate(texts)
    ]
    docs = splitter.create_documents(texts=texts, metadatas=metadatas)
    return texts, metadatas, docs
def create_docs(input_file):
    """Create documents from a whole file, all tagged with the file's stem.

    The file content is handed to the splitter as a single text with a
    single ``{'source': <file stem>}`` metadata dict; the splitter uses a
    fixed ``chunk_size`` of 1000 and no overlap.

    Args:
        input_file: Path of the file to read via ``get_content``.

    Returns:
        The documents returned by the splitter's ``create_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=0, length_function=len
    )
    file_name = os.path.basename(input_file)
    source_name = os.path.splitext(file_name)[0]
    whole_text = get_content(input_file=input_file)
    return splitter.create_documents(
        texts=[whole_text], metadatas=[{'source': source_name}]
    )
def get_similar_docs(query, index, k=5):
    """Return the text and metadata of the ``k`` documents closest to ``query``.

    Args:
        query: Text to search for.
        index: Object with a ``similarity_search(query=..., k=...)`` method
            whose results expose ``page_content`` and ``metadata``.
        k: Number of documents to request.

    Returns:
        A list of ``(page_content, metadata)`` tuples, in search order.
    """
    similar_docs = index.similarity_search(query=query, k=k)
    # The document text lives in ``page_content``; the results have no
    # ``summary`` attribute, so reading it raised AttributeError.
    return [(doc.page_content, doc.metadata) for doc in similar_docs]
def convert_to_html(similar_docs):
    """Render ``(text, metadata)`` pairs as an HTML table.

    Both the text and the ``source`` value are HTML-escaped before being
    placed in a cell, so characters such as ``<`` or ``&`` in the document
    text are displayed literally instead of being parsed as markup.

    Args:
        similar_docs: Iterable of ``(text, metadata)`` pairs, where
            ``metadata`` is a mapping with a string ``'source'`` entry.

    Returns:
        An HTML ``<table>`` string with one row per pair (no rows when
        ``similar_docs`` is empty).
    """
    # Imported locally so the block brings its own dependency into scope.
    from html import escape

    rows = [
        '<tr><td>' + escape(text) + '</td><td>' + escape(metadata['source']) + '</td></tr>'
        for text, metadata in similar_docs
    ]
    return (
        '<table><thead><th>Page Content</th><th>Source</th></thead><tbody>'
        + '\n'.join(rows)
        + '</tbody></table>'
    )
def create_similarity_plot(embeddings, labels, query, n_clusters=3):
    """Show a 3D scatter plot of embeddings coloured by k-means cluster.

    Embeddings beyond the number of labels are dropped. The remaining
    vectors are projected to three PCA components and clustered with
    k-means on the original (unprojected) vectors. The last retained point
    is additionally drawn as a black diamond named after ``query``.

    Args:
        embeddings: Sequence of embedding vectors.
        labels: Hover texts, one per plotted point.
        query: Query text, used only in the name of the highlighted trace.
        n_clusters: Number of k-means clusters.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # zip stops at the shorter input, so embeddings without a label are dropped
    labelled_vectors = [vector for vector, _label in zip(embeddings, labels)]

    # Project to 3 dimensions for plotting
    projector = PCA(n_components=3)
    coords = projector.fit_transform(labelled_vectors)

    # Cluster on the full-dimensional vectors
    clusterer = KMeans(n_clusters=n_clusters)
    clusterer.fit(labelled_vectors)

    # Highlight trace: the last retained point, presented as the query
    last_x, last_y, last_z = coords[-1, 0], coords[-1, 1], coords[-1, 2]
    query_trace = go.Scatter3d(
        x=[last_x],
        y=[last_y],
        z=[last_z],
        mode='markers',
        marker={'color': 'black', 'symbol': 'diamond', 'size': 10},
        name=f"Query: '{query}'",
    )

    # All retained points, coloured by their cluster assignment
    points_trace = go.Scatter3d(
        x=coords[:, 0],
        y=coords[:, 1],
        z=coords[:, 2],
        mode='markers',
        marker={
            'color': clusterer.labels_,
            'colorscale': px.colors.qualitative.Alphabet,
            'size': 5,
        },
        text=labels,
        name='Points',
    )

    figure = go.Figure(data=[query_trace, points_trace])
    figure.update_layout(title="3D Similarity Plot", legend_title_text="Cluster")
    figure.show()
def plot_similarities(query, index, embeddings=None, k=5):
    """Plot the documents most similar to ``query`` together with the query.

    The ``k`` most similar documents are fetched with ``get_similar_docs``,
    embedded, and passed to ``create_similarity_plot``. The query's own
    embedding and text are appended as the final point and label, because
    ``create_similarity_plot`` highlights the last point as the query.

    Args:
        query: Text to search for and to highlight in the plot.
        index: Vector index handed to ``get_similar_docs``.
        embeddings: Object providing ``embed_documents`` and ``embed_query``.
            When ``None`` a ``HuggingFaceEmbeddings()`` is created on demand,
            rather than once at import time as a default argument value.
        k: Number of similar documents to plot.

    Returns:
        None. The plot is displayed by ``create_similarity_plot``.
    """
    if embeddings is None:
        embeddings = HuggingFaceEmbeddings()

    similar_docs = get_similar_docs(query=query, index=index, k=k)
    texts = [text for text, _metadata in similar_docs]

    # Query goes last in both lists so points and labels stay aligned.
    points = list(embeddings.embed_documents(texts=texts))
    points.append(embeddings.embed_query(text=query))
    labels = texts + [query]

    create_similarity_plot(
        embeddings=points,
        labels=labels,
        query=query,
        n_clusters=3,
    )
def start_ui(index):
    """Launch a Gradio interface that searches ``index`` and shows an HTML table.

    Args:
        index: Vector index passed to ``get_similar_docs`` for every query.

    Returns:
        None. The call hands control to ``iface.launch()``.
    """
    def query_index(query):
        """Return the similar documents for ``query`` as an HTML table."""
        similar_docs = get_similar_docs(query=query, index=index)
        return convert_to_html(similar_docs=similar_docs)

    # Component classes are taken from the top-level namespace; the names
    # avoid shadowing the builtin ``input``.
    query_box = gr.Textbox(lines=2)
    result_view = gr.HTML()

    iface = gr.Interface(fn=query_index,
                         inputs=query_box,
                         outputs=result_view)
    iface.launch()