search / app.py
loubnabnl's picture
loubnabnl HF staff
intermediate changes
808f2e9
raw
history blame
4.55 kB
import html
import json
import logging
import os
import re
import string

import gradio as gr
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
# Module-wide Elasticsearch client used by the query helpers below.
# The endpoint is read from the "host" environment variable; if it is unset,
# None is passed through to the client constructor.
es = Elasticsearch(
    os.environ.get("host"),
    timeout=100,
    http_compress=True,
    maxsize=1000,
)
def mark_tokens_bold(string, tokens):
    """Wrap every occurrence of each token in ``string`` with highlight markup.

    All tokens are matched literally and substituted in a single pass, so the
    markup inserted for one token is never re-scanned for another token.

    Args:
        string: Text to search. The parameter name shadows the ``string``
            module inside this function; it is kept for backward
            compatibility with existing callers.
        tokens: Iterable of literal substrings to highlight. Empty tokens are
            ignored because an empty pattern matches at every position.

    Returns:
        The text with every match wrapped in a coloured, bold ``<span>``.
    """
    # Longest tokens first so a token wins over any shorter token that is a
    # prefix of it; the secondary key keeps the pattern deterministic.
    unique_tokens = sorted({t for t in tokens if t}, key=lambda t: (-len(t), t))
    if not unique_tokens:
        return string

    pattern = re.compile("|".join(re.escape(t) for t in unique_tokens))

    def _highlight(match):
        # A callable replacement inserts the matched text verbatim; a string
        # replacement would interpret backslashes in the token as escapes.
        return "<span style='color: #e6b800;'><b>" + match.group(0) + "</b></span>"

    return pattern.sub(_highlight, string)
# HTML fragment rendered for a single hit. Placeholders, in order:
# source, language, license, code box height in pixels, code text.
_RESULT_TEMPLATE = """\
<p style='font-size:16px; text-align: left;'><b>Source: </b><span style='color: #00134d;'>{}</span>&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;<b>Language:</b> \
<span style='color: #00134d;'>{}</span>&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;<b>License: </b><span style='color: #00134d;'>{}</span></p>
<br>
<pre style='height: {}px; overflow-y: scroll; overflow-x: hidden; color: #d9d9d9;border: 1px solid #e6b800; padding: 10px'><code>{}</code></pre>
<br>
<hr>
<br>
"""


def process_results(results):
    """Render search hits as one HTML string.

    Args:
        results: Sequence of dicts, each with the keys ``"text"``,
            ``"repository"``, ``"license"`` and ``"language"``.

    Returns:
        A "no results" message when ``results`` is empty, otherwise the
        concatenation of one HTML fragment per hit. Every interpolated value
        is HTML-escaped so that characters such as ``<`` and ``&`` in the
        code are displayed literally instead of being parsed as markup.
    """
    if not results:
        return """<br><p>No results retrieved.</p><br><hr>"""

    fragments = []
    for result in results:
        code_text = result["text"]
        # 20px per line of code, capped at a maximum height of 600px.
        code_height = min(600, len(code_text.split('\n')) * 20)
        fragments.append(
            _RESULT_TEMPLATE.format(
                html.escape(str(result["repository"])),
                html.escape(str(result["language"])),
                html.escape(str(result["license"])),
                code_height,
                html.escape(str(code_text)),
            )
        )
    return "".join(fragments)
def match_query(query, num_results=10):
    """Run a ``match`` query on the ``content`` field and return the response.

    Args:
        query: Text to search for.
        num_results: Upper bound of the slice applied to the search.

    Returns:
        The response object produced by executing the search against the
        index named by the "index" environment variable.
    """
    index_name = os.environ.get("index")
    request = Search(using=es, index=index_name)
    request.query = Q("match", content=query)
    return request[:num_results].execute()
def phrase_query(query, num_results=10):
    """Run a ``match_phrase`` query on the ``content`` field and return the response.

    Args:
        query: Phrase to search for.
        num_results: Upper bound of the slice applied to the search.

    Returns:
        The response object produced by executing the search against the
        index named by the "index" environment variable.
    """
    index_name = os.environ.get("index")
    request = Search(using=es, index=index_name)
    request.query = Q("match_phrase", content=query)
    return request[:num_results].execute()
def search(query, num_results=10):
    """Search the index for ``query`` and return the hits rendered as HTML.

    A query enclosed in double quotes is sent as a phrase query with the
    quotes removed; any other query is sent as a regular match query.

    Args:
        query: The user's search text.
        num_results: Maximum number of hits to request.

    Returns:
        The HTML string produced by ``process_results`` for the hits.
    """
    # Require something between the quotes: a lone '"' or an empty '""'
    # would otherwise become an empty phrase query.
    is_quoted_phrase = len(query) > 2 and query.startswith('"') and query.endswith('"')
    if is_quoted_phrase:
        response = phrase_query(query[1:-1], num_results=num_results)
        # Diagnostic output goes through logging instead of stdout prints.
        logging.getLogger(__name__).debug("phrase query returned %d hits", len(response))
    else:
        response = match_query(query, num_results=num_results)

    results = [
        {
            "text": hit.content,
            "repository": f"{hit.repository}/{hit.path}",
            # NOTE(review): assumes every hit carries a non-empty license list - confirm against the index schema.
            "license": hit.license[0],
            "language": hit.language,
        }
        for hit in response
    ]
    return process_results(results)
# Markdown/HTML banner shown at the top of the page.
# The heading emoji was stored as mis-decoded UTF-8 bytes; it is restored here.
description = """# <p style="text-align: center;"><span style='color: #e6b800;'>StarCoder:</span> Dataset Search 🔍 </p>
<span>When using <a href="https://huggingface.co/bigcode/large-model" style="color: #e6b800;">StarCoder</a> to generate code, it might produce exact copies of code in the pretraining dataset. \
In that case, the code license might have requirements to comply with. With this search tool, our aim is to help in identifying if the code belongs to an existing repository. For exact matches, enclose your query in double quotes.</span>"""
# Font fallback chain for the UI: the Google-hosted "Open Sans" first,
# then generic system fonts.
_FONT_STACK = [
    gr.themes.GoogleFont("Open Sans"),
    "ui-sans-serif",
    "system-ui",
    "sans-serif",
]

# Gradio theme applied to the whole Blocks app.
theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    radius_size=gr.themes.sizes.radius_sm,
    font=_FONT_STACK,
)
# Use a monospace font in the query textbox (element id "q-input").
monospace_css = """
#q-input textarea {
font-family: monospace, 'Consolas', Courier, monospace;
}
"""

# Custom CSS passed to the Blocks app.
# NOTE(review): an earlier `css = ".generating {visibility: hidden}"` assignment
# was immediately overwritten by this one and never took effect, so that dead
# store was removed. If the rule is actually wanted, prepend it here.
css = monospace_css + ".gradio-container {color: black}"
if __name__ == "__main__":
    # Build the UI: description banner, query box, result-count slider,
    # submit button and an HTML pane for the rendered hits.
    demo = gr.Blocks(
        theme=theme,
        css=css,
    )
    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=5, placeholder="Type your query here...", label="Query", elem_id="q-input")
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results", value="")

        def submit(query, k, lang="en"):
            """Handle a search request from the UI.

            Args:
                query: Raw text from the query textbox.
                k: Value of the "Max Results" slider.
                lang: Unused; kept so the signature stays backward-compatible.

            Returns:
                A dict mapping the ``results`` component to its new HTML
                value (empty when the query is blank).
            """
            # Normalise before stripping so a missing value cannot raise.
            query = (query or "").strip()
            if not query:
                # Exactly one output component is wired up, so return exactly
                # one value for it.
                return {results: ""}
            # The slider value ends up as a slice bound in the query helpers,
            # so make sure it is an integer.
            return {
                results: search(query, int(k)),
            }

        # Run the search on Enter in the textbox and on button click.
        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    demo.launch(enable_queue=True, debug=True)