search

Running

File size: 3,997 Bytes

8de89ad
 
 
 
 
 
 
 
 
 
390f7bf
8de89ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f7fd94
8de89ad
 
 
 
 
 
 
 
 
 
 
273f67e
8de89ad
 
 
 
7f7fd94
8de89ad
273f67e
8de89ad
 
 
 
 
d09c314
82ae5dc
8b2c11c
82ae5dc
8de89ad
 
7f7fd94
8de89ad
 
 
273f67e
8de89ad

import html
import json
import logging
import os
import re
import string

import gradio as gr
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

# Module-wide Elasticsearch client; the server address is taken from the
# "host" environment variable.
es = Elasticsearch(
    os.environ.get("host"),
    timeout=100,
    http_compress=True,
    maxsize=1000,
)

def mark_tokens_bold(string, tokens):
    """Wrap every occurrence of each token in pink, bold HTML markup.

    All tokens are matched literally in a single pass over the input, so
    markup inserted for one token is never re-scanned for another, and the
    matched text is emitted verbatim (a backslash in a token is not treated
    as a regex replacement escape).

    Args:
        string: Text to highlight.
        tokens: Iterable of literal substrings to highlight. Empty tokens
            are ignored.

    Returns:
        The text with each match wrapped in
        ``<span style='color: #ff75b3;'><b>...</b></span>``. The input is
        returned unchanged when there is nothing to highlight.
    """
    # Longest first, so a token that is a prefix of another one does not
    # win the alternation over the longer match.
    literals = sorted(
        {token for token in tokens if token},
        key=lambda token: (-len(token), token),
    )
    if not literals:
        return string

    pattern = re.compile("|".join(re.escape(token) for token in literals))
    # A callable replacement is used because a replacement *string* would
    # have its backslash sequences interpreted by re.sub.
    return pattern.sub(
        lambda match: "<span style='color: #ff75b3;'><b>" + match.group(0) + "</b></span>",
        string,
    )


def process_results(results):
    """Render search hits as an HTML fragment.

    Args:
        results: Sequence of dicts, each with a "text" entry (the content
            to display) and a "repository" entry (the source label).

    Returns:
        An HTML string with one "Source:" line and one scrollable code box
        per hit, or a "No results retrieved." message when ``results`` is
        empty.
    """
    if not results:
        return """<br><p>No results retrieved.</p><br><hr>"""

    template = """\
            <p style='font-size:16px; text-align: left; color: white;'>Source: <span style='color: #727cd6;'>{}</span></p>
            <br>
            <pre style='height: 600px; overflow-y: scroll; overflow-x: hidden; color: #d9d9d9;border: 1px solid #ff75b3; padding: 10px'><code>{}</code></pre>
            <br>
            <hr>
            <br>
        """
    # Both values are escaped so that "<", ">" and "&" are displayed
    # literally instead of being parsed as markup. str() keeps the original
    # tolerance for non-string values, which str.format would have
    # stringified.
    return "".join(
        template.format(
            html.escape(str(result["repository"])),
            html.escape(str(result["text"])),
        )
        for result in results
    )


def match_query(query, num_results=10):
    """Run a ``match`` query for ``query`` against the ``content`` field.

    The index name is read from the "index" environment variable and the
    search is sliced to the first ``num_results`` hits before execution.

    Returns:
        The response object produced by executing the search.
    """
    request = Search(using=es, index=os.environ.get("index"))
    request.query = Q("match", content=query)
    return request[:num_results].execute()

def phrase_query(query, num_results=10):
    """Run a ``match_phrase`` query for ``query`` against the ``content`` field.

    The index name is read from the "index" environment variable and the
    search is sliced to the first ``num_results`` hits before execution.

    Returns:
        The response object produced by executing the search.
    """
    request = Search(using=es, index=os.environ.get("index"))
    request.query = Q("match_phrase", content=query)
    return request[:num_results].execute()

def search(query, num_results=10):
    """Search the index for ``query`` and render the hits as HTML.

    A query wrapped in double quotes is run as a phrase query on the text
    between the quotes; any other query is run as a regular match query.

    Args:
        query: The query string.
        num_results: Maximum number of hits to request.

    Returns:
        The HTML string built by ``process_results`` from the hits.
    """
    # Require text between the quotes: a lone '"' (which both starts and
    # ends with a quote) or '""' would otherwise become an empty phrase.
    is_phrase = len(query) > 2 and query.startswith('"') and query.endswith('"')
    if is_phrase:
        response = phrase_query(query[1:-1], num_results=num_results)
        logging.getLogger(__name__).debug(
            "phrase query returned %d hits", len(response)
        )
    else:
        response = match_query(query, num_results=num_results)

    results = [
        {
            "text": hit.content,
            "repository": f"{hit.username}/{hit.repository}/{hit.path}",
        }
        for hit in response
    ]
    return process_results(results)


# Markdown/HTML introduction text for the page; it explains the purpose of the
# tool and that exact-match queries must be enclosed in double quotes.
description = """# <p style="text-align: center; color: white;"><span style='color: #ff75b3;'>StarCoder:</span> Dataset Search 🔍 </p>
<span style='color: white;'>When you use <a href="https://huggingface.co/bigcode/large-model" style="color: #ff75b3;">StarCoder</a> to generate code it might produce exact copies of code in the pretraining dataset. 
In that case, the code license might have requirements to comply with. 
With this search tool we aim to provide help to find out where the code came from, in order for the user to comply with licensing requirements in case the code produced by StarCoder belongs to an already existing repository. For exact matches, enclose your query in double quotes.</span>"""


if __name__ == "__main__":
    # Build the single-page UI: description, query box, result-count slider,
    # submit button and an HTML pane for the rendered hits.
    demo = gr.Blocks(
        css=".gradio-container {background-color: #20233fff; color:white}"
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=5, placeholder="Type your query here...", label="Query")
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results", value="<img src='https://huggingface.co/datasets/bigcode/admin/resolve/main/bigcode_contact.png' alt='contact' style='display: block; margin: auto; max-width: 800px;'>")

        def submit(query_text, k):
            """Run the search for the textbox content and fill the results pane.

            Args:
                query_text: Current value of the query textbox.
                k: Current value of the "Max Results" slider.

            Returns:
                A mapping from the ``results`` component to its new HTML
                value; an empty string when the query is blank.
            """
            # Normalise before testing so that a missing value cannot make
            # .strip() raise.
            query_text = (query_text or "").strip()
            if not query_text:
                # Only one output component is wired to this handler, so
                # exactly one value is returned for it.
                return {results: ""}
            return {results: search(query_text, k)}

        # Pressing Enter in the textbox and clicking the button both run the
        # same handler.
        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    demo.launch(enable_queue=True, debug=True)