import http.client as http_client
import json
import logging
import os
import re
import string
import gradio as gr
import requests
def mark_tokens_bold(string, tokens):
    """Rewrite every literal occurrence of each token in ``string``.

    Tokens are matched as plain substrings (they are regex-escaped first) and
    anywhere in the text, without word-boundary anchors.

    Args:
        string: Text to process.
        tokens: Iterable of substrings to look for.

    Returns:
        The text after all tokens have been substituted, in iteration order.
    """
    for token in tokens:
        pattern = re.escape(token)
        # NOTE(review): both wrapper strings are empty in this source, so the
        # substitution currently leaves the text unchanged; given the function
        # name, the opening/closing bold markup was presumably lost - confirm.
        replacement = "" + token + ""
        # Use a callable replacement: a plain string would be parsed by re.sub
        # as a template, so a backslash inside the token would be treated as
        # an escape or group reference and could raise re.error.
        string = re.sub(pattern, lambda _match, text=replacement: text, string)
    return string
def process_results(results, highlight_terms):
    """Return the fallback message shown when a search produced no results.

    Args:
        results: Collection of search hits; an empty or missing collection
            triggers the fallback message.
        highlight_terms: Not used by the code visible in this function.

    Returns:
        The fallback text when ``results`` is empty. No rendering path for
        non-empty results is present in this source, so the function returns
        ``None`` in that case.
    """
    # NOTE(review): the literal below looks like several templates (result
    # row, page description) merged into one string - confirm against the
    # upstream file. It is reproduced verbatim here.
    if not results:
        return """
No results retrieved.
Repository name: {}
Repository path: {}
Repository licenses: {}
🎅 SantaCoder: Dataset Search 🔍
When you use SantaCoder to generate code it might produce exact copies of code in the pretraining dataset. In that case, the code license might have requirements to comply with. With this search tool we aim to provide help to find out where the code came from, in order for the user to comply with licensing requirements in case the code produced by SantaCoder belongs to an already existing repository."""


if __name__ == "__main__":
    demo = gr.Blocks(
        css=".gradio-container {background-color: #20233fff; color:white}"
    )

    with demo:
        with gr.Row():
            # NOTE(review): `description` is not defined in the visible
            # source - it must be provided at module level before this runs.
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(
                lines=5, placeholder="Type your query here...", label="Query"
            )
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results", value="")

        def submit(query, k, lang="en"):
            """Run the search for ``query`` and return the HTML for ``results``."""
            # Guard against None before stripping; the original stripped
            # first, which made its None check unreachable.
            query = (query or "").strip()
            if not query:
                # Exactly one output component is wired, so return exactly
                # one value (the original returned a 2-tuple here).
                return {results: ""}
            # NOTE(review): `scisearch` is not defined in the visible source.
            return {
                results: scisearch(query, lang, k),
            }

        # Pressing Enter in the textbox and clicking the button both search.
        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    demo.launch(enable_queue=True, debug=True)