cakiki committed on
Commit
8de89ad
•
0 Parent(s):

Duplicate from bigcode/py-search

Browse files
Files changed (4) hide show
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +98 -0
  4. requirements.txt +2 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.npy filter=lfs diff=lfs merge=lfs -text
14
+ *.npz filter=lfs diff=lfs merge=lfs -text
15
+ *.onnx filter=lfs diff=lfs merge=lfs -text
16
+ *.ot filter=lfs diff=lfs merge=lfs -text
17
+ *.parquet filter=lfs diff=lfs merge=lfs -text
18
+ *.pb filter=lfs diff=lfs merge=lfs -text
19
+ *.pickle filter=lfs diff=lfs merge=lfs -text
20
+ *.pkl filter=lfs diff=lfs merge=lfs -text
21
+ *.pt filter=lfs diff=lfs merge=lfs -text
22
+ *.pth filter=lfs diff=lfs merge=lfs -text
23
+ *.rar filter=lfs diff=lfs merge=lfs -text
24
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
25
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
26
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
27
+ *.tflite filter=lfs diff=lfs merge=lfs -text
28
+ *.tgz filter=lfs diff=lfs merge=lfs -text
29
+ *.wasm filter=lfs diff=lfs merge=lfs -text
30
+ *.xz filter=lfs diff=lfs merge=lfs -text
31
+ *.zip filter=lfs diff=lfs merge=lfs -text
32
+ *.zst filter=lfs diff=lfs merge=lfs -text
33
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
34
+ *.pdf filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: StarCoder Python Search
3
+ emoji: 🔎📑🐍
4
+ colorFrom: red
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 3.12.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: bigcode/py-search
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import re
5
+ import string
6
+
7
+ import gradio as gr
8
+ from elasticsearch import Elasticsearch
9
+ from elasticsearch_dsl import Search, Q
10
+
11
# Module-wide Elasticsearch client shared by the query helpers below.
# The cluster address is read from the "host" environment variable and
# timeout=10 is forwarded to the client constructor.
es = Elasticsearch(hosts=os.environ.get("host"), timeout=10)
12
+
13
def mark_tokens_bold(string, tokens):
    """Wrap every occurrence of each token in ``string`` with highlight markup.

    Args:
        string: Text to search. The name shadows the ``string`` module inside
            this function; it is kept so keyword callers continue to work.
        tokens: Iterable of literal substrings to highlight. Empty tokens and
            duplicates are ignored.

    Returns:
        ``string`` with each match wrapped in a pink, bold ``<span>``. The
        input is returned unchanged when there is nothing to highlight.
    """
    # De-duplicate while keeping order, then try longer tokens first so that
    # "foobar" wins over "foo" when both could match at the same position.
    literals = sorted(dict.fromkeys(t for t in tokens if t), key=len, reverse=True)
    if not literals:
        return string

    # One pass over the *original* text: substituting token by token would let
    # later tokens (e.g. "b" or "span") match inside markup already inserted.
    pattern = re.compile("|".join(map(re.escape, literals)))

    def _highlight(match):
        # A callable replacement is inserted verbatim, whereas a replacement
        # string has its backslashes interpreted (a token like "\d" would
        # raise re.error).
        return "<span style='color: #ff75b3;'><b>" + match.group(0) + "</b></span>"

    return pattern.sub(_highlight, string)
18
+
19
+
20
def process_results(results):
    """Render search hits as an HTML fragment.

    Args:
        results: Sequence of mappings, each with a ``"text"`` entry (the file
            content to display) and a ``"repository"`` entry (its caption).

    Returns:
        An HTML string with one repository caption and one scrollable code box
        per hit, or a "No results retrieved." notice when ``results`` is empty.
    """
    # Imported locally so this function does not rely on the file's import
    # header, which does not pull in ``html``.
    import html

    if not results:
        return """<br><p>No results retrieved.</p><br><hr>"""

    template = (
        "<p style='font-size:16px; text-align: left; color: white;'>"
        "Repository: <span style='color: #727cd6;'>{}</span></p>\n"
        "<br>\n"
        "<pre style='height: 600px; overflow-y: scroll; overflow-x: hidden; "
        "color: #d9d9d9;border: 1px solid #ff75b3; padding: 10px'>"
        "<code>{}</code></pre>\n"
        "<br>\n"
        "<hr>\n"
        "<br>\n"
    )

    # Both fields come from indexed third-party source files, so they are
    # escaped before being embedded: raw "<", ">" or "&" in the code would
    # otherwise be parsed as markup by the browser.
    return "".join(
        template.format(
            html.escape(str(result["repository"])),
            html.escape(str(result["text"])),
        )
        for result in results
    )
39
+
40
+
41
def match_query(query, num_results=10):
    """Run a ``match`` query for ``query`` and return the response.

    Args:
        query: Text to look up; passed as the ``source__content`` keyword of
            the ``match`` query.
        num_results: Upper bound on the number of hits requested.

    Returns:
        The object returned by executing the elasticsearch-dsl search.
    """
    # The target index name is taken from the "index" environment variable.
    index_name = os.environ.get("index")
    request = Search(using=es, index=index_name)
    request.query = Q("match", source__content=query)
    # Slicing limits the request to the first ``num_results`` hits.
    return request[:num_results].execute()
47
+
48
def phrase_query(query, num_results=10):
    """Run a ``match_phrase`` query for ``query`` and return the response.

    Args:
        query: Phrase to look up; passed as the ``source__content`` keyword of
            the ``match_phrase`` query exactly as received.
        num_results: Upper bound on the number of hits requested.

    Returns:
        The object returned by executing the elasticsearch-dsl search.
    """
    # The target index name is taken from the "index" environment variable.
    index_name = os.environ.get("index")
    request = Search(using=es, index=index_name)
    request.query = Q("match_phrase", source__content=query)
    # Slicing limits the request to the first ``num_results`` hits.
    return request[:num_results].execute()
54
+
55
def search(query, num_results=10):
    """Search the index for ``query`` and return the hits rendered as HTML.

    A query wrapped in double quotes is sent as a phrase query; anything else
    is sent as a plain match query.

    Args:
        query: Search text, optionally enclosed in double quotes.
        num_results: Maximum number of hits to retrieve.

    Returns:
        The HTML fragment produced by ``process_results``.
    """
    # An empty query cannot match anything; skip the round-trip and avoid
    # indexing into an empty string.
    if not query:
        return process_results([])

    # Require at least two characters so a lone '"' is not treated as an
    # (empty) quoted phrase.
    is_quoted = len(query) >= 2 and query.startswith('"') and query.endswith('"')
    run_query = phrase_query if is_quoted else match_query
    response = run_query(query, num_results=num_results)

    results = [
        {
            "text": hit.source.content,
            "repository": f"{hit.source.username}/{hit.source.repository}",
        }
        for hit in response
    ]
    return process_results(results)
62
+
63
+
64
# Markdown/HTML banner shown at the top of the page (passed to gr.Markdown).
# The magnifying-glass emoji is written as a real Unicode character; it had
# been stored as mis-decoded bytes.
description = """# <p style="text-align: center; color: white;"><span style='color: #ff75b3;'>StarCoder:</span> Python Dataset Search 🔍 </p>
<span style='color: white;'>When you use <a href="https://huggingface.co/bigcode/large-model" style="color: #ff75b3;">StarCoder</a> to generate code it might produce exact copies of code in the pretraining dataset.
In that case, the code license might have requirements to comply with.
With this search tool we aim to provide help to find out where the code came from, in order for the user to comply with licensing requirements in case the code produced by StarCoder belongs to an already existing repository. For exact matches, enclose your query in double quotes.</span>"""
68
+
69
+
70
if __name__ == "__main__":
    # Build the single-page UI: banner, query box, result-count slider,
    # submit button and an HTML pane for the rendered hits.
    demo = gr.Blocks(
        css=".gradio-container {background-color: #20233fff; color:white}"
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=5, placeholder="Type your query here...", label="Query")
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results", value="<img src='https://huggingface.co/datasets/bigcode/admin/resolve/main/bigcode_contact.png' alt='contact' style='display: block; margin: auto; max-width: 800px;'>")

        def submit(query, k, lang="en"):
            """Handle a search request from the textbox or the button.

            Args:
                query: Raw text from the query box (may be empty or None).
                k: Maximum number of results, as provided by the slider.
                lang: Unused; kept for signature compatibility.

            Returns:
                A mapping from the ``results`` component to the HTML to show.
            """
            # Normalise before stripping so a None value cannot raise.
            query = (query or "").strip()
            if not query:
                # Exactly one output component is wired up, so return exactly
                # one value (an empty pane) rather than a pair.
                return {results: ""}
            # The slider value is used as a slice bound downstream, which
            # must be an integer.
            return {results: search(query, int(k))}

        # Pressing Enter in the textbox and clicking the button both run the
        # same handler.
        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    demo.launch(enable_queue=True, debug=True)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
1
+ elasticsearch
2
+ elasticsearch-dsl