Spaces:

0x70DA
/

abs-qa-demo

Sleeping

App Files Files Community

0x70DA committed on Mar 22, 2023

Commit

633e625

•

1 Parent(s): a464a99

Add lots of code

Browse files

Files changed (3) hide show

README.md +3 -3
app.py +133 -0
requirements.txt +8 -0

README.md CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-title: Abstractive Qa Demo
-emoji: ⚡
-colorFrom: purple
 colorTo: purple
 sdk: gradio
 sdk_version: 3.23.0

 ---
+title: Abstractive QA Demo
+emoji: ❓
+colorFrom: pink
 colorTo: purple
 sdk: gradio
 sdk_version: 3.23.0

app.py ADDED Viewed

	@@ -0,0 +1,133 @@

+from typing import List
+import faiss
+import numpy as np
+import gradio as gr
+import requests
+import torch
+from bs4 import BeautifulSoup
+from datasets import Dataset
+from sentence_transformers import SentenceTransformer
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+# Load retriever model
+torch.set_grad_enabled(False)  # Disable gradients
+device = "cuda" if torch.cuda.is_available() else "cpu"
+retriever = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1", device=device)
+# Load generation model
+tokenizer = AutoTokenizer.from_pretrained("yjernite/bart_eli5")
+model = AutoModelForSeq2SeqLM.from_pretrained("yjernite/bart_eli5").to(device)
+def scrape(urls: List[str]) -> Dataset:
+    data = []
+    chunk_size = 100
+    # Extract the text inside all the <p> tags for each search result
+    for url in urls:
+        # Send the request and get the response
+        response = requests.get(url)
+        # Parse the response HTML with BeautifulSoup
+        soup = BeautifulSoup(response.text, "html.parser")
+        # Find all the <p> tags in the HTML and extract their text
+        for string in soup.stripped_strings:
+            text = repr(string).split()
+            contexts = [
+                " ".join(text[i : i + chunk_size])
+                for i in range(0, len(text), chunk_size)
+            ]
+            for context in contexts:
+                if len(context.split()) >= 15:
+                    data.append({"context": context, "url": url})
+    return Dataset.from_list(data)
+def search_web(query: str) -> List[str]:
+    url = f"https://www.google.com/search?q={query}"
+    # Set the user agent to avoid being blocked by Google
+    headers = {
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
+    }
+    # Send the search request and get the response
+    response = requests.get(url, headers=headers)
+    # Parse the response HTML with BeautifulSoup
+    soup = BeautifulSoup(response.content, "html.parser")
+    # Find the search results in the HTML
+    search_results = soup.find_all("div", class_="g")
+    # Extract the title and URL of the top search results
+    urls = set()
+    for result in search_results[:10]:
+        url = result.find("a")["href"]
+        if url.startswith("http"):
+            urls.add(url)
+    return urls
+def generate_answer(question_doc: str) -> str:
+    q_toks = tokenizer.batch_encode_plus(
+        [question_doc], max_length=1024, pad_to_max_length=True
+    )
+    q_ids, q_mask = (
+        torch.LongTensor(q_toks["input_ids"]).to(device),
+        torch.LongTensor(q_toks["attention_mask"]).to(device),
+    )
+    model_output = model.generate(
+        input_ids=q_ids,
+        attention_mask=q_mask,
+        min_new_tokens=32,
+        max_new_tokens=256,
+        no_repeat_ngram_size=3,
+        num_beams=2,
+        do_sample=True,
+        length_penalty=1.5,
+    )
+    answer = tokenizer.batch_decode(model_output, skip_special_tokens=True)[0]
+    return answer.strip()
+def predict(question: str) -> str:
+    urls = search_web(question)
+    data = scrape(urls)
+    # Create vector embeddings and add Faiss index
+    data_with_embeds = data.map(
+        lambda batch: {"embeddings": retriever.encode(batch["context"])}, batched=True
+    )
+    data_with_embeds.add_faiss_index(
+        column="embeddings", metric_type=faiss.METRIC_INNER_PRODUCT
+    )
+    # Get the most relevant examples
+    scores, relevant_examples = data_with_embeds.get_nearest_examples(
+        "embeddings", retriever.encode([question]), k=20
+    )
+    doc = "<P> " + " <P> ".join(
+        relevant_examples["context"]
+    )  # The support document for the model
+    # Generate answer
+    question_doc = f"question: {question} context: {doc}"
+    return generate_answer(question_doc)
+input_box = gr.Textbox(label="Question")
+output_box = gr.Textbox(label="Answer")
+description = """
+<div style="text-align: center;">
+        <p style="font-style: italic;"> Disclaimer: This is just a stupid demo and it craches a lot. Don't take it too seriously.</p>
+        ✌😎
+</div>
+"""
+demo = gr.Interface(
+    fn=predict, inputs=input_box, outputs=output_box, description=description
+).queue()
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+transformers
+sentence-transformers
+datasets
+torch
+beautifulsoup4
+requests
+numpy
+faiss-cpu