lvwerra's picture
lvwerra HF staff
Update app.py
28bd1d5
raw history blame
No virus
3.75 kB
import html
import http.client as http_client
import json
import logging
import os
import re
import string

import gradio as gr
import requests
def mark_tokens_bold(string, tokens):
    """Wrap every occurrence of the given tokens in pink, bold HTML markup.

    Args:
        string: Text to search. (The parameter name shadows the ``string``
            module inside this function; it is kept so keyword callers keep
            working.)
        tokens: Iterable of literal substrings to highlight. Matching is
            case-sensitive and is not restricted to word boundaries.

    Returns:
        ``string`` with each match wrapped in
        ``<span style='color: #ff75b3;'><b>...</b></span>``. No HTML escaping
        is performed here.
    """
    # An empty token would match at every position; duplicates add nothing.
    unique_tokens = {token for token in tokens if token}
    if not unique_tokens:
        return string

    # Build a single alternation and substitute in one pass so that a token
    # can never match inside markup inserted for another token (e.g. "span",
    # "b" or "color"). Longest tokens go first so "foobar" wins over "foo".
    ordered = sorted(unique_tokens, key=lambda token: (-len(token), token))
    pattern = "|".join(re.escape(token) for token in ordered)

    def _wrap(match):
        # A function replacement inserts the matched text literally; a string
        # replacement would interpret backslashes in the token as escapes.
        return "<span style='color: #ff75b3;'><b>" + match.group(0) + "</b></span>"

    return re.sub(pattern, _wrap, string)
def process_results(results, highlight_terms):
    """Render search hits as an HTML fragment.

    Args:
        results: Sequence of dicts. Each one is read for the keys ``"text"``,
            ``"repo_license"`` (an iterable of strings), ``"repo_name"`` and
            ``"repo_path"``.
        highlight_terms: Substrings to highlight inside each hit's text.

    Returns:
        One HTML section per hit concatenated into a single string, or a
        "No results retrieved." message when ``results`` is empty.
    """
    if not results:
        return """<br><p>No results retrieved.</p><br><hr>"""

    # The values come from a remote service and hold arbitrary source code,
    # so they are escaped before being placed into HTML. The terms are escaped
    # the same way so they still match the escaped text.
    # NOTE(review): assumes the service returns raw, un-escaped text - confirm.
    escaped_terms = [html.escape(term) for term in highlight_terms]

    sections = []
    for result in results:
        text_html = mark_tokens_bold(html.escape(result["text"]), escaped_terms)
        licenses = html.escape(" | ".join(result["repo_license"]))
        repo_name = html.escape(str(result["repo_name"]))
        repo_path = html.escape(str(result["repo_path"]))
        sections.append("""\
<p style='font-size:16px; text-align: left; color: white;'>Repository name: <span style='color: #727cd6;'>{}</span></p>
<p style='font-size:16px; text-align: left; color: white;'>Repository path: <span style='color: #727cd6;'>{}</span></p>
<p style='font-size:16px; text-align: left; color: white;'>Repository licenses: <span style='color: #727cd6;'>{}</span></p>
<br>
<pre style='height: 600px; overflow-y: scroll; overflow-x: hidden; color: #d9d9d9;border: 1px solid #ff75b3; padding: 10px'><code>{}</code></pre>
<br>
<hr>
<br>
""".format(repo_name, repo_path, licenses, text_html))

    # Join once instead of growing a string with += inside the loop.
    return "".join(sections)
def scisearch(query, language, num_results=10):
    """Send a query to the remote search service and render the hits as HTML.

    Args:
        query: Search string; runs of whitespace are collapsed to single
            spaces before sending. ``None`` is treated like an empty query.
        language: Accepted for interface compatibility; it is not sent to the
            service.
        num_results: Sent to the service as ``"k"``.

    Returns:
        ``""`` for an empty query, otherwise the HTML produced by
        ``process_results`` for the service's response.

    Raises:
        RuntimeError: If the ``address`` environment variable is not set.
        requests.HTTPError: If the service answers with an error status.
    """
    # Check for None before touching the value; the original only tested it
    # after calling .split(), which would already have raised.
    if query is None:
        return ""
    query = " ".join(query.split())
    if not query:
        return ""

    address = os.environ.get("address")
    if not address:
        raise RuntimeError('The "address" environment variable must hold the search service URL.')

    post_data = {"query": query, "k": num_results}
    response = requests.post(
        address,
        headers={"Content-type": "application/json"},
        data=json.dumps(post_data),
        timeout=60,
    )
    # Surface HTTP errors directly rather than as a JSON or key error below.
    response.raise_for_status()

    payload = json.loads(response.text)
    return process_results(payload["results"], payload["highlight_terms"])
# Markdown/HTML header shown at the top of the page.
# TODO: the link target is still the placeholder "todo", and the link text
# says "IceCoder" while the title says "SantaCoder" - confirm the intended
# name and URL.
description = """# <p style="text-align: center; color: white;"><span style='color: #ff75b3;'>🎅 SantaCoder:</span> Dataset Search 🔍 </p>
<span style='color: white;'>When you use <a href="todo" style="color: #ff75b3;">IceCoder</a> to generate code it might produce exact copies of code in the pretraining dataset. In that case the code's license may have requirements to comply with,
and with this search tool we aim to help find out where the code came from.</span>"""
if __name__ == "__main__":
    # Build the page: header, query box, result-count slider, submit button
    # and the HTML area that shows the hits.
    demo = gr.Blocks(
        css=".gradio-container {background-color: #20233fff; color:white}"
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=5, placeholder="Type your query here...", label="Query")
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results", value="<img src='https://huggingface.co/datasets/bigcode/admin/resolve/main/bigcode_contact.png' alt='contact' style='display: block; margin: auto; max-width: 800px;'>")

        def submit(query_text, max_results, lang="en"):
            """Run a search for the textbox contents and update the results area."""
            # Guard against None before stripping; the original stripped first.
            query_text = (query_text or "").strip()
            if not query_text:
                # Only one output component is wired up, so return exactly one
                # value (the original returned two here).
                return {results: ""}
            return {
                results: scisearch(query_text, lang, max_results),
            }

        # Pressing Enter in the textbox and clicking the button both search.
        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    demo.launch(enable_queue=True, debug=True)