File size: 4,374 Bytes
cb35e87
 
 
 
 
 
 
 
 
 
 
385bf5d
 
fb9e6d1
96d3a8b
385bf5d
 
 
cb35e87
 
 
 
 
 
 
385bf5d
 
cb35e87
 
 
 
 
 
 
 
 
655c971
e146ae1
 
 
 
 
cb35e87
e146ae1
 
 
 
cb35e87
 
e146ae1
cb35e87
 
 
 
 
 
655c971
 
 
cb35e87
655c971
cb35e87
655c971
 
 
 
 
 
cb35e87
655c971
cb35e87
655c971
 
 
cb35e87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
655c971
 
cb35e87
 
 
 
 
 
 
01b1b14
cb35e87
 
 
 
 
 
 
01b1b14
 
cb35e87
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import http.client as http_client
import json
import logging
import os
import re
import string

import gradio as gr
import requests


def mark_tokens_bold(string, tokens):
  for token in tokens:
    pattern = re.escape(token) #r"\b" + re.escape(token) + r"\b"
    string = re.sub(pattern, "<span style='color: red;'><b>" + token + "</b></span>", string)
  return string


def process_results(results, highlight_terms):
    if len(results) == 0:
        return """<br><p style='font-family: Arial; color:Silver; text-align: center;'>
                No results retrieved.</p><br><hr>"""

    results_html = ""
    for result in results:
        text_html = result["text"]
        text_html = mark_tokens_bold(text_html, highlight_terms)
        meta_html = (
            """
                <p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>
                <a href='{}' target='_blank'>{}</a></p>""".format(
                result["meta"]["url"], result["meta"]["url"]
            )
            if "meta" in result and result["meta"] is not None and "url" in result["meta"]
            else ""
        )
        docid_html = str(result["docid"])

        licenses = " | ".join(result["repo_license"])
        repo_name = result["repo_name"]
        repo_path = result["repo_path"]
        
        results_html += """{}
            <p style='font-size:16px; font-family: Arial; text-align: left;'>Repository name: <span style='color: #20233fff;'>{}</span></p>
            <p style='font-size:16px; font-family: Arial; text-align: left;'>Repository path: <span style='color: #20233fff;'>{}</span></p>
            <p style='font-size:16px; font-family: Arial; text-align: left;'>Repository licenses: <span style='color: #20233fff;'>{}</span></p>
            <pre style='height: 600px; overflow: scroll;'><code>{}</code></pre>
            <br>
        """.format(
            meta_html, repo_name, repo_path, licenses, text_html
        )
    return results_html + "<hr>"


def scisearch(query, language, num_results=10):

    query = " ".join(query.split())
    if query == "" or query is None:
        return ""

    post_data = {"query": query, "k": num_results}

    output = requests.post(
        os.environ.get("address"),
        headers={"Content-type": "application/json"},
        data=json.dumps(post_data),
        timeout=60,
    )

    payload = json.loads(output.text)

    results = payload["results"]
    highlight_terms = payload["highlight_terms"]
    return process_results(results, highlight_terms)


description = """# <p style="text-align: center;"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). This tool allows
you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages included in
ROOTS. You can read more about the details of the tool design
[here](https://huggingface.co/spaces/bigscience-data/scisearch/blob/main/roots_search_tool_specs.pdf). For more
information and instructions on how to access the full corpus check [this form](https://forms.gle/qyYswbEL5kA23Wu99)."""


if __name__ == "__main__":
    demo = gr.Blocks(
        css=".underline-on-hover:hover { text-decoration: underline; } .flagging { font-size:12px; color:Silver; }"
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=1, max_lines=1, placeholder="Type your query here...", label="Query")

            
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results")

        def submit(query, k, lang="en"):
            query = query.strip()
            if query is None or query == "":
                return "", ""
            return {
                results: scisearch(query, lang, k),
            }

        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    demo.launch(enable_queue=True, debug=True)