Pavankalyan committed on
Commit
17283b0
1 Parent(s): 4681ada

Upload 7 files

Files changed (7)
  1. Responses.csv +0 -0
  2. app.py +19 -0
  3. corpus.pt +3 -0
  4. data_process.py +44 -0
  5. main.py +22 -0
  6. requirements.txt +2 -0
  7. retrieval.py +69 -0
Responses.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,19 @@
+ import gradio as gr
+ from data_process import *
+ from retrieval import *
+
+ df = pd.read_csv("Responses.csv")
+ text = list(df["text"].values)
+
+
+ def chitti(query):
+     re_table = search(query, text)
+     return re_table[0][0]  # text of the top-ranked answer
+
+ demo = gr.Interface(
+     fn=chitti,
+     inputs=["text"],
+     outputs=["text"],
+ )
+ demo.launch(share=True)  # share=True also exposes a temporary public URL
+
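Once the app is running, the endpoint can also be exercised programmatically. A minimal sketch using gradio_client, assuming the default local address and endpoint name; the question is made up:

    from gradio_client import Client

    client = Client("http://127.0.0.1:7860")  # assumed default local address
    # "/predict" is the default endpoint name for a single gr.Interface
    print(client.predict("What does the bot know about?", api_name="/predict"))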
corpus.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90d8781fef8d3a3b5a5130ce095c186c076a05ee25e3980cc3cf2577910302b2
+ size 5803755
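corpus.pt is tracked via Git LFS, so the diff shows only the pointer (hash and size) rather than the file itself: it is the cached passage-embedding tensor that get_corpus in retrieval.py writes with torch.save.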
data_process.py ADDED
@@ -0,0 +1,44 @@
+ import os
+ import re
+ import pandas as pd
+
+
+ def merge_text(text_list):
+     i = 0
+     j = 1
+
+     k = len(text_list)
+
+     while j < k:
+         if len(text_list[i].split()) <= 30:  # short chunk: fold it into the next one
+             text_list[j] = text_list[i] + " " + text_list[j]
+             text_list[i] = " "
+         i += 1
+         j += 1
+
+     return [accepted for accepted in text_list if accepted != " "]
+
+
+ def get_text(path):
+     doc_list = sorted(os.listdir(path))
+     text = []
+     for doc in doc_list:
+         sub_text = []
+         with open(os.path.join(path, doc), encoding='utf-8') as f:
+             for line in f.readlines():
+                 temp_text = re.sub("\\n", "", line)
+                 if temp_text != "":
+                     sub_text.append(temp_text)
+
+         sub_text = merge_text(sub_text)
+         text.extend(sub_text)
+     return text
+
+
+ def dataframe(path):
+     text = get_text(path)
+     df = {
+         "text": text
+     }
+     df = pd.DataFrame(df)
+     df.to_csv("Responses.csv")  # cached for reuse by app.py and main.py
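To make merge_text concrete, a worked example with made-up chunks: every chunk of 30 words or fewer is folded into its successor, so a run of short lines collapses into one passage.

    from data_process import merge_text

    # all three chunks are under 30 words, so each is merged forward:
    print(merge_text(["Opening hours:", "Mon-Fri 9-5.", "Sat 10-2."]))
    # -> ['Opening hours: Mon-Fri 9-5. Sat 10-2.']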
main.py ADDED
@@ -0,0 +1,22 @@
+ from data_process import *
+ from retrieval import *
+ import argparse
+
+
+ parser = argparse.ArgumentParser(description="Run the query for the bot")
+ parser.add_argument('--query', help="Question to the bot", type=str, required=True)
+ parser.add_argument('--data_path', help="Path for the stored dataset", type=str, required=True)
+
+ args = parser.parse_args()
+ path = args.data_path
+ query = args.query
+
+ if "Responses.csv" not in os.listdir(os.getcwd()):  # build the passage CSV on first run
+     dataframe(path)
+
+ df = pd.read_csv("Responses.csv")
+ text = list(df["text"].values)
+
+
+ print(search(query, text))
+
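A typical invocation, with a placeholder question and data directory:

    python main.py --query "What are the opening hours?" --data_path ./docs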
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ sentence-transformers
+ pandas
+ gradio
+ tabulate
retrieval.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ import textwrap
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
+ import torch
+ from tabulate import tabulate
+ import time
+
+ model_bi_encoder = "msmarco-distilbert-base-tas-b"
+ model_cross_encoder = "cross-encoder/ms-marco-MiniLM-L-12-v2"
+
+ bi_encoder = SentenceTransformer(model_bi_encoder)
+ bi_encoder.max_seq_length = 512
+
+ cross_encoder = CrossEncoder(model_cross_encoder)
+
+ top_k = 20
+
+
+ def get_corpus(passages):
+     # Embed the passages once and cache the tensor on disk.
+     if "corpus.pt" not in os.listdir(os.getcwd()):
+         corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)
+         torch.save(corpus_embeddings, "corpus.pt")
+     else:
+         corpus_embeddings = torch.load("corpus.pt")
+
+     return corpus_embeddings
+
+
+ def search(query, passages):
+     # Stage 1: fast bi-encoder retrieval of the top_k candidate passages.
+     corpus_embeddings = get_corpus(passages)
+     question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
+
+     be = time.process_time()
+     hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
+     # print("Time taken by Bi-encoder: " + str(time.process_time() - be))
+
+     hits = hits[0]
+     cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
+
+     # Stage 2: re-rank the candidates with the slower, more accurate cross-encoder.
+     ce = time.process_time()
+     cross_scores = cross_encoder.predict(cross_inp)
+     # print("Time taken by Cross-encoder: " + str(time.process_time() - ce))
+
+     # Sort results by the cross-encoder scores
+     for idx in range(len(cross_scores)):
+         hits[idx]['cross-score'] = cross_scores[idx]
+
+     hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
+     result_table = list()
+     for hit in hits[0:5]:
+         ans = "{}".format(passages[hit['corpus_id']].replace("\n", " "))
+         cs = "{}".format(hit['cross-score'])
+         sc = "{}".format(hit['score'])
+         wrapper = textwrap.TextWrapper(width=50)
+         ans = wrapper.fill(text=ans)
+         result_table.append([ans, cs, sc])
+
+     # print(tabulate(result_table, headers=["Answer", "Cross-encoder score", "Bi-encoder score"], tablefmt="fancy_grid"))
+     return result_table
+
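For reference, a direct call to the two-stage retriever outside the app, assuming Responses.csv already exists and using a made-up query; tabulate is used here only for display:

    import pandas as pd
    from tabulate import tabulate
    from retrieval import search

    passages = list(pd.read_csv("Responses.csv")["text"].values)
    # returns the top-5 re-ranked rows: [answer, cross-encoder score, bi-encoder score]
    results = search("hypothetical question about the corpus", passages)
    print(tabulate(results, headers=["Answer", "Cross-encoder score", "Bi-encoder score"], tablefmt="fancy_grid"))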