legacy107 committed
Commit
4723439
1 Parent(s): 1d6aee5

Update app.py

Files changed (1)
  1. app.py +73 -8
app.py CHANGED
@@ -5,6 +5,10 @@ from peft import PeftModel
 import torch
 import datasets
 from sentence_transformers import CrossEncoder
+import re
+from nltk import sent_tokenize, word_tokenize
+import nltk
+nltk.download('punkt')
 
 # Load cross encoder
 top_k = 10
@@ -29,6 +33,69 @@ dataset = datasets.load_dataset("minh21/COVID-QA-Chunk-64-testset-biencoder-data
 dataset = dataset.shuffle()
 dataset = dataset.select(range(5))
 
+# Context chunking
+min_sentences_per_chunk = 3
+chunk_size = 64
+window_size = math.ceil(min_sentences_per_chunk * 0.25)
+over_lap_chunk_size = chunk_size * 0.25
+
+def chunk_splitter(context):
+    sentences = sent_tokenize(context)
+    chunks = []
+    current_chunk = []
+
+    for sentence in sentences:
+        if len(current_chunk) < min_sentences_per_chunk:
+            current_chunk.append(sentence)
+            continue
+        elif len(word_tokenize(' '.join(current_chunk) + " " + sentence)) < chunk_size:
+            current_chunk.append(sentence)
+            continue
+
+        chunks.append(' '.join(current_chunk))
+        new_chunk = current_chunk[-window_size:]
+        new_window = window_size
+        buffer_new_chunk = new_chunk
+
+        while len(word_tokenize(' '.join(new_chunk))) <= over_lap_chunk_size:
+            buffer_new_chunk = new_chunk
+            new_window += 1
+            new_chunk = current_chunk[-new_window:]
+            if new_window >= len(current_chunk):
+                break
+
+        current_chunk = buffer_new_chunk
+        current_chunk.append(sentence)
+
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
+
+
+def clean_data(text):
+    # Extract abstract content
+    index = text.find("\nAbstract: ")
+    if index != -1:
+        cleaned_text = text[index + len("\nAbstract: "):]
+    else:
+        cleaned_text = text  # If "\nAbstract: " is not found, keep the original text
+
+    # Remove both http and https links using a regular expression
+    cleaned_text = re.sub(r'(http(s|)\/\/:( |)\S+)|(http(s|):\/\/( |)\S+)', '', cleaned_text)
+
+
+    # Remove DOI patterns like "doi:10.1371/journal.pone.0007211.s003"
+    cleaned_text = re.sub(r'doi:( |)\w+', '', cleaned_text)
+
+    # Remove the "(0.11 MB DOC)" pattern
+    cleaned_text = re.sub(r'\(0\.\d+ MB DOC\)', '', cleaned_text)
+
+    cleaned_text = re.sub(r'www\.\w+(.org|)', '', cleaned_text)
+
+    return cleaned_text
+
 
 def paraphrase_answer(question, answer):
     # Combine question and context
@@ -70,9 +137,8 @@ def retrieve_context(question, contexts):
 
 
 # Define your function to generate answers
-def generate_answer(question, context, contexts):
-    if type(contexts) is str:
-        contexts = contexts.split(',')
+def generate_answer(question, context):
+    contexts = chunk_splitter(clean_data(context))
     context = retrieve_context(question, contexts)
 
     # Combine question and context
@@ -97,7 +163,7 @@ def generate_answer(question, context, contexts):
     # Paraphrase answer
     paraphrased_answer = paraphrase_answer(question, generated_answer)
 
-    return generated_answer, paraphrased_answer
+    return generated_answer, context, paraphrased_answer
 
 
 # Define a function to list examples from the dataset
@@ -105,9 +171,8 @@ def list_examples():
     examples = []
     for example in dataset:
        context = example["context"]
-       contexts = example["context_chunks"]
        question = example["question"]
-       examples.append([question, context, contexts])
+       examples.append([question, context])
    return examples
 
 
@@ -116,11 +181,11 @@ iface = gr.Interface(
    fn=generate_answer,
    inputs=[
        Textbox(label="Question"),
-       Textbox(label="Context"),
-       Textbox(label="Contexts")
+       Textbox(label="Context")
    ],
    outputs=[
        Textbox(label="Generated Answer"),
+       Textbox(label="Retrieved Context"),
        Textbox(label="Natural Answer")
    ],
    examples=list_examples()
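Note: window_size is computed with math.ceil, but this commit does not add an import for math; presumably import math already exists earlier in app.py, outside the hunks shown. For reference, a quick standalone evaluation of the new chunking constants (the values in the comments simply re-evaluate the expressions from the diff):

import math

min_sentences_per_chunk = 3
chunk_size = 64  # rough word_tokenize token budget per chunk

# Number of trailing sentences carried over into the next chunk as overlap.
window_size = math.ceil(min_sentences_per_chunk * 0.25)  # ceil(0.75) == 1

# Token threshold the overlap window is grown to exceed (25% of chunk_size).
over_lap_chunk_size = chunk_size * 0.25  # 16.0

print(window_size, over_lap_chunk_size)  # 1 16.0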
 
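Note: clean_data strips boilerplate from a raw context before it is chunked. Below is a small standalone check of the same regular expressions; the sample string is made up for illustration and is not taken from the dataset:

import re

def clean_data(text):
    # Same logic and patterns as the new clean_data in app.py above.
    index = text.find("\nAbstract: ")
    cleaned_text = text[index + len("\nAbstract: "):] if index != -1 else text
    # http/https links
    cleaned_text = re.sub(r'(http(s|)\/\/:( |)\S+)|(http(s|):\/\/( |)\S+)', '', cleaned_text)
    # doi: tags
    cleaned_text = re.sub(r'doi:( |)\w+', '', cleaned_text)
    # "(0.11 MB DOC)" attachment markers
    cleaned_text = re.sub(r'\(0\.\d+ MB DOC\)', '', cleaned_text)
    # bare www addresses
    cleaned_text = re.sub(r'www\.\w+(.org|)', '', cleaned_text)
    return cleaned_text

sample = ("Title: Example\nAbstract: Supplementary material at https://example.org/s1 "
          "(0.11 MB DOC) doi:s003 is available.")
print(clean_data(sample))
# Prints the abstract text with the link, the doi tag and the "(0.11 MB DOC)"
# marker removed (the surrounding spaces are left behind).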
 
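Note: the interface now takes two inputs (question and raw context) and returns three outputs, with chunking and retrieval happening inside generate_answer, so list_examples() yields [question, context] pairs. A minimal self-contained sketch of that wiring with a stand-in answer function; the stub logic and example strings are placeholders rather than the real model pipeline:

import gradio as gr

# Stand-in for the real pipeline: clean_data -> chunk_splitter -> retrieve_context
# -> generation -> paraphrase_answer.
def generate_answer_stub(question, context):
    retrieved = context[:100]                # placeholder for the retrieved chunk
    answer = "stub answer for: " + question  # placeholder for the generated answer
    natural = answer                         # placeholder for the paraphrased answer
    return answer, retrieved, natural

iface = gr.Interface(
    fn=generate_answer_stub,
    inputs=[
        gr.Textbox(label="Question"),
        gr.Textbox(label="Context"),
    ],
    outputs=[
        gr.Textbox(label="Generated Answer"),
        gr.Textbox(label="Retrieved Context"),
        gr.Textbox(label="Natural Answer"),
    ],
    examples=[["What is studied here?", "Title: Example\nAbstract: An illustrative context paragraph."]],
)

if __name__ == "__main__":
    iface.launch()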