Spaces:
Running
on
Zero
Running
on
Zero
crazyjames
commited on
Commit
•
977fb98
1
Parent(s):
d833c98
update
Browse files- README.md +20 -5
- app.py +172 -0
- requirement.txt +13 -0
README.md
CHANGED
@@ -1,13 +1,28 @@
|
|
1 |
---
|
2 |
title: LongCite
|
3 |
-
emoji:
|
4 |
colorFrom: yellow
|
5 |
colorTo: purple
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 4.
|
|
|
|
|
8 |
app_file: app.py
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
---
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
title: LongCite
|
3 |
+
emoji: 💬
|
4 |
colorFrom: yellow
|
5 |
colorTo: purple
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.41.0
|
8 |
+
suggested_hardware: a100-large
|
9 |
+
app_port: 7860
|
10 |
app_file: app.py
|
11 |
+
models:
|
12 |
+
- THUDM/LongCite-glm4-9b
|
13 |
+
tags:
|
14 |
+
- long-context
|
15 |
+
- chat
|
16 |
+
- thudm
|
17 |
+
short_description: LLM for long context
|
18 |
+
disable_embedding: false
|
19 |
---
|
20 |
|
21 |
+
# LongCite
|
22 |
+
|
23 |
+
## How to run this space
|
24 |
+
Run the app directly from the source code:
|
25 |
+
|
26 |
+
```shell
|
27 |
+
python app.py
|
28 |
+
```
|
app.py
ADDED
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import subprocess
|
2 |
+
import gradio as gr
|
3 |
+
import torch
|
4 |
+
from transformers import (
|
5 |
+
AutoModelForCausalLM,
|
6 |
+
AutoTokenizer,
|
7 |
+
)
|
8 |
+
import docx
|
9 |
+
import PyPDF2
|
10 |
+
|
11 |
+
def convert_to_txt(file):
    """Extract the plain text of a document stored on disk.

    Args:
        file: Path of the document as a string. The text after the last "."
            selects the parser: txt/md/py are read as UTF-8, pdf is parsed
            with PyPDF2 and docx with python-docx.

    Returns:
        The document text. PDF pages and DOCX paragraphs are joined with a
        blank line between them.

    Raises:
        gr.Error: If the extension is not one of the supported types.
    """
    doc_type = file.split(".")[-1].strip()
    # Match case-insensitively so "REPORT.PDF" is handled like "report.pdf";
    # the raw extension is kept for the error message.
    kind = doc_type.lower()
    if kind in ("txt", "md", "py"):
        # `file` is a path string (it is split above), so it must be opened
        # before reading; calling .read() on the string itself raises
        # AttributeError.
        with open(file, "rb") as handle:
            data = [handle.read().decode('utf-8')]
    elif kind == "pdf":
        pdf_reader = PyPDF2.PdfReader(file)
        # Defensive: fall back to "" if a page yields no text so the join
        # below never sees a non-string.
        data = [page.extract_text() or "" for page in pdf_reader.pages]
    elif kind == "docx":
        data = [p.text for p in docx.Document(file).paragraphs]
    else:
        raise gr.Error(f"ERROR: unsupported document type: {doc_type}")
    return "\n\n".join(data)
|
25 |
+
|
26 |
+
# Hugging Face Hub id of the checkpoint this app serves.
model_name = "THUDM/LongCite-glm4-9b"
# trust_remote_code=True allows transformers to execute Python code shipped in
# the model repository. NOTE(review): `model.query_longcite`, used by run_llm,
# presumably comes from that remote code — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Weights are loaded in bfloat16; device_map='auto' leaves device placement to
# the library.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map='auto')
|
29 |
+
|
30 |
+
html_styles = """<style>
|
31 |
+
.reference {
|
32 |
+
color: blue;
|
33 |
+
text-decoration: underline;
|
34 |
+
}
|
35 |
+
.highlight {
|
36 |
+
background-color: yellow;
|
37 |
+
}
|
38 |
+
.label {
|
39 |
+
font-family: sans-serif;
|
40 |
+
font-size: 16px;
|
41 |
+
font-weight: bold;
|
42 |
+
}
|
43 |
+
.Bold {
|
44 |
+
font-weight: bold;
|
45 |
+
}
|
46 |
+
.statement {
|
47 |
+
background-color: lightgrey;
|
48 |
+
}
|
49 |
+
</style>\n"""
|
50 |
+
|
51 |
+
def process_text(text):
    """Escape *text* for embedding in HTML and turn newlines into ``<br>``.

    Args:
        text: Raw string (a model statement or a cited snippet).

    Returns:
        The string with ``&``, ``'``, ``"``, ``<`` and ``>`` replaced by HTML
        entities and every newline replaced by ``<br>``.
    """
    # The table must map to real entities: mapping a character to itself
    # leaves markup in the text unescaped. translate() visits each input
    # character exactly once, so the "&" that starts an emitted entity is
    # never escaped a second time.
    escapes = str.maketrans({
        '&': '&amp;',
        '\'': '&#39;',
        '"': '&quot;',
        '<': '&lt;',
        '>': '&gt;',
        '\n': '<br>',
    })
    return text.translate(escapes)
|
63 |
+
|
64 |
+
def convert_to_html(statements, clicked=-1):
    """Render statements and their citations as HTML fragments.

    Args:
        statements: Sequence of dicts with a 'statement' string and a
            'citation' list. Each citation is a dict with the keys
            'start_sentence_idx', 'end_sentence_idx', 'start_char_idx',
            'end_char_idx' and 'cite'.
        clicked: Index of the statement to emphasise; the default -1 matches
            no statement.

    Returns:
        A 4-tuple ``(answer_html, all_citations_html, clicked_citations_html,
        cite_num2idx)``. ``clicked_citations_html`` stays None when no
        statement index equals ``clicked``. ``cite_num2idx`` maps a label such
        as "[1,2]" to the index of the statement carrying those citations.
    """
    # Shared container markup for both citation panels.
    panel = '<div style="overflow-y: auto; padding: 20px; border: 0px dashed black; border-radius: 6px; background-color: #EFF2F6;">{}</div>'
    snippet_sep = "<br><br>\n"

    answer_parts = [html_styles + '<br><span class="label">Answer:</span><br>\n']
    every_snippet = []
    clicked_cite_html = None
    cite_num2idx = {}
    running_no = 0  # citation numbers run across the whole answer

    for pos, entry in enumerate(statements):
        is_clicked = clicked == pos
        sentence, refs = process_text(entry['statement']), entry['citation']
        wrapper = '<span class="statement">{}</span>' if is_clicked else "<span>{}</span>"
        answer_parts.append(wrapper.format(sentence))

        if refs:
            snippets = []
            numbers = []
            highlight_attr = 'class="highlight"' if is_clicked else ""
            for ref in refs:
                running_no += 1
                numbers.append(str(running_no))
                located = '[Sentence: {}-{}\t|\tChar: {}-{}]<br>\n<span {}>{}</span>'.format(
                    ref['start_sentence_idx'],
                    ref['end_sentence_idx'],
                    ref['start_char_idx'],
                    ref['end_char_idx'],
                    highlight_attr,
                    process_text(ref['cite'].strip()),
                )
                snippets.append(f"""<span><span class="Bold">Snippet [{running_no}]:</span><br>{located}</span>""")
            every_snippet.extend(snippets)
            label = '[{}]'.format(','.join(numbers))
            cite_num2idx[label] = pos
            answer_parts.append(""" <span class="reference" style="color: blue" id={}>{}</span>""".format(pos, label))

        answer_parts.append('\n')

        # NOTE(review): `snippets` is only (re)assigned for statements that
        # have citations; this relies on `clicked` pointing at such a
        # statement, as the indices stored in cite_num2idx do.
        if is_clicked:
            clicked_cite_html = html_styles + (
                '<br><span class="label">Citations of current statement:</span><br>' + panel
            ).format(snippet_sep.join(snippets))

    if every_snippet:
        # The full list is shown without the per-click highlight.
        listing = snippet_sep.join(every_snippet).replace('<span class="highlight">', '<span>')
    else:
        listing = "No citation in the answer"
    all_cite_html = html_styles + (
        '<br><span class="label">All citations:</span><br>\n' + panel
    ).format(listing)

    return "".join(answer_parts), all_cite_html, clicked_cite_html, cite_num2idx
|
94 |
+
|
95 |
+
def render_context(file):
    """Display the text of an uploaded document in the context textbox.

    Args:
        file: Value delivered by the upload component; it must expose a
            ``name`` attribute holding the path of the uploaded file.

    Returns:
        A ``gr.Textbox`` carrying the extracted text, made visible.

    Raises:
        gr.Error: If ``file`` has no ``name`` attribute.
    """
    # Guard first: anything without a `name` is treated as "nothing uploaded".
    if not hasattr(file, "name"):
        raise gr.Error("ERROR: no uploaded document")
    extracted = convert_to_txt(file.name)
    return gr.Textbox(extracted, visible=True)
|
101 |
+
|
102 |
+
def run_llm(context, query):
    """Answer *query* about *context* and build the component updates.

    Args:
        context: Document text taken from the context textbox.
        query: The user's question.

    Returns:
        A dict keyed by the module-level UI components (statements, answer,
        all_citations, cite_num2idx, citation_choices, clicked_citations)
        with the new value for each.

    Raises:
        gr.Error: If either the context or the query is empty.
    """
    if not context:
        raise gr.Error("Error: no uploaded document")
    if not query:
        raise gr.Error("Error: no query")

    generation = model.query_longcite(
        context,
        query,
        tokenizer=tokenizer,
        max_input_length=128000,
        max_new_tokens=1024,
    )
    all_statements = generation['all_statements']

    # No statement is clicked yet, so the third element (clicked citations)
    # is not needed here.
    rendered = convert_to_html(all_statements)
    answer_html, all_cite_html, _, cite_num2idx_dict = rendered
    cite_nums = list(cite_num2idx_dict)
    has_citations = bool(cite_nums)

    return {
        statements: gr.JSON(all_statements),
        answer: gr.HTML(answer_html, visible=True),
        all_citations: gr.HTML(all_cite_html, visible=True),
        cite_num2idx: gr.JSON(cite_num2idx_dict),
        # The selector is only shown when there is something to select.
        citation_choices: gr.Radio(cite_nums, visible=has_citations),
        # Hide the details left over from a previous answer.
        clicked_citations: gr.HTML(visible=False),
    }
|
119 |
+
|
120 |
+
def chose_citation(statements, cite_num2idx, clicked_cite_num):
    """Re-render the answer with the selected citation group emphasised.

    Args:
        statements: The statement list produced by the last model run.
        cite_num2idx: Mapping from a citation label (e.g. "[1,2]") to the
            index of its statement.
        clicked_cite_num: The label picked in the citation selector.

    Returns:
        A dict updating the module-level ``answer`` and ``clicked_citations``
        components.
    """
    # The parameters `statements` and `cite_num2idx` shadow the components of
    # the same name; `answer` and `clicked_citations` below are the globals.
    statement_idx = cite_num2idx[clicked_cite_num]
    answer_html, _all, clicked_cite_html, _mapping = convert_to_html(
        statements, clicked=statement_idx
    )
    updates = {
        answer: gr.HTML(answer_html, visible=True),
        clicked_citations: gr.HTML(clicked_cite_html, visible=True),
    }
    return updates
|
127 |
+
|
128 |
+
# UI definition. The component variables created here (statements, answer,
# all_citations, cite_num2idx, citation_choices, clicked_citations) are the
# module-level names that run_llm and chose_citation use as keys of the dicts
# they return.
with gr.Blocks() as demo:
    # Page header: title, project links and a usage note.
    gr.Markdown(
        """
<div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
LongCite-glm4-9b Huggingface Space🤗
</div>
<div style="text-align: center;">
<a href="https://huggingface.co/THUDM/LongCite-glm4-9b">🤗 Model Hub</a> |
<a href="https://github.com/THUDM/LongCite">🌐 Github</a> |
<a href="https://arxiv.org/pdf/">📜 arxiv </a>
</div>
<br>
<div style="text-align: center; font-size: 15px; font-weight: bold; margin-bottom: 20px; line-height: 1.5;">
If you plan to use it long-term, please consider deploying the model or forking this space yourself.
</div>
"""
    )

    # Input row: document upload and question on the left, extracted document
    # text on the right.
    with gr.Row():
        with gr.Column(scale=4):
            file = gr.File(label="Upload a document (supported type: pdf, docx, txt, md, py)")
            query = gr.Textbox(label='Question')
            submit_btn = gr.Button("Submit")

        with gr.Column(scale=4):
            # Created hidden; render_context returns it with visible=True.
            context = gr.Textbox(label="Document content", autoscroll=False, placeholder="No uploaded document.", max_lines=10, visible=False)

    # Extract the text as soon as a file is uploaded.
    file.upload(render_context, [file], [context])

    # Output row: answer and citation selector on the left, citation details
    # on the right.
    with gr.Row():
        with gr.Column(scale=4):
            # Hidden JSON components carry state between the two callbacks.
            statements = gr.JSON(label="statements", visible=False)
            answer = gr.HTML(label="Answer", visible=True)
            cite_num2idx = gr.JSON(label="cite_num2idx", visible=False)
            citation_choices = gr.Radio(label="Chose citations for details", visible=False, interactive=True)

        with gr.Column(scale=4):
            clicked_citations = gr.HTML(label="Citations of the chosen statement", visible=False)
            all_citations = gr.HTML(label="All citations", visible=False)

    # Submit runs the model; changing the selected citation re-renders the
    # answer and the clicked-citation panel.
    submit_btn.click(run_llm, [context, query], [statements, answer, all_citations, cite_num2idx, citation_choices, clicked_citations])
    citation_choices.change(chose_citation, [statements, cite_num2idx, citation_choices], [answer, clicked_citations])

# Enable request queuing, then start the server.
demo.queue()
demo.launch()
|
requirement.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio==4.41.0
|
2 |
+
torch==2.3.1
|
3 |
+
transformers==4.43.0
|
4 |
+
spaces==0.29.2
|
5 |
+
accelerate==0.33.0
|
6 |
+
sentencepiece==0.2.0
|
7 |
+
huggingface-hub==0.24.5
|
8 |
+
sentencepiece==0.2.0
|
9 |
+
jinja2==3.1.4
|
10 |
+
sentence_transformers==3.0.1
|
11 |
+
tiktoken==0.7.0
|
12 |
+
einops==0.8.0
|
13 |
+
nltk==3.8.1
|