crazyjames committed on
Commit
977fb98
1 Parent(s): d833c98
Files changed (3) hide show
  1. README.md +20 -5
  2. app.py +172 -0
  3. requirement.txt +13 -0
README.md CHANGED
@@ -1,13 +1,28 @@
1
  ---
2
  title: LongCite
3
- emoji: 🏢
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 4.42.0
 
 
8
  app_file: app.py
9
- pinned: false
10
- license: apache-2.0
 
 
 
 
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
1
  ---
2
  title: LongCite
3
+ emoji: 💬
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.41.0
8
+ suggested_hardware: a100-large
9
+ app_port: 7860
10
  app_file: app.py
11
+ models:
12
+ - THUDM/LongCite-glm4-9b
13
+ tags:
14
+ - long-context
15
+ - chat
16
+ - thudm
17
+ short_description: LLM for long context
18
+ disable_embedding: false
19
  ---
20
 
21
+ # LongCite
22
+
23
+ ## How to run this space
24
+ Run the app with the following command:
25
+
26
+ ```shell
27
+ python app.py
28
+ ```
app.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import gradio as gr
3
+ import torch
4
+ from transformers import (
5
+ AutoModelForCausalLM,
6
+ AutoTokenizer,
7
+ )
8
+ import docx
9
+ import PyPDF2
10
+
11
def convert_to_txt(file):
    """Extract the plain text of a document stored on disk.

    Args:
        file: Path of the document as a string. The document type is the
            text after the last "." and is matched case-insensitively:
            ``txt``/``md``/``py`` are decoded as UTF-8, ``pdf`` is read page
            by page with PyPDF2 and ``docx`` paragraph by paragraph with
            python-docx.

    Returns:
        The pages/paragraphs (or the whole text file) joined by blank lines.

    Raises:
        gr.Error: If the document type is not one of the supported ones.
    """
    doc_type = file.split(".")[-1].strip()
    kind = doc_type.lower()
    if kind in ("txt", "md", "py"):
        # `file` is a path string (render_context passes `file.name`), so it
        # has no .read(); open the path and decode the bytes explicitly.
        with open(file, "rb") as fh:
            data = [fh.read().decode("utf-8")]
    elif kind == "pdf":
        pdf_reader = PyPDF2.PdfReader(file)
        data = [page.extract_text() for page in pdf_reader.pages]
    elif kind == "docx":
        data = [p.text for p in docx.Document(file).paragraphs]
    else:
        raise gr.Error(f"ERROR: unsupported document type: {doc_type}")
    return "\n\n".join(data)
25
+
26
# Checkpoint served by this demo.
model_name = "THUDM/LongCite-glm4-9b"

# trust_remote_code=True lets transformers run the modelling/tokenizer code
# shipped in the checkpoint repository.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map='auto',
)

# Stylesheet prepended to every HTML fragment built by convert_to_html.
html_styles = """<style>
.reference {
color: blue;
text-decoration: underline;
}
.highlight {
background-color: yellow;
}
.label {
font-family: sans-serif;
font-size: 16px;
font-weight: bold;
}
.Bold {
font-weight: bold;
}
.statement {
background-color: lightgrey;
}
</style>\n"""
50
+
51
# Single-pass translation table: HTML-special characters become entities and
# newlines become <br>. Because every character is translated exactly once,
# the "&" introduced by an entity is never escaped a second time.
_HTML_ESCAPES = str.maketrans({
    '&': '&amp;',
    '\'': '&apos;',
    '"': '&quot;',
    '<': '&lt;',
    '>': '&gt;',
    '\n': '<br>',
})


def process_text(text):
    """Escape *text* for embedding in HTML and turn newlines into ``<br>``.

    Args:
        text: Raw string to display.

    Returns:
        The string with ``& ' " < >`` replaced by their HTML entities and
        each newline replaced by ``<br>``.
    """
    return text.translate(_HTML_ESCAPES)
63
+
64
def convert_to_html(statements, clicked=-1):
    """Render the answer statements and their citations as HTML fragments.

    Args:
        statements: Sequence of dicts with a ``'statement'`` string and a
            ``'citation'`` list. Each citation is a dict with the keys
            ``start_sentence_idx``, ``end_sentence_idx``, ``start_char_idx``,
            ``end_char_idx`` and ``cite``. ``None`` is treated as empty.
        clicked: Index of the statement to emphasise, or -1 for none.

    Returns:
        A 4-tuple ``(html, all_cite_html, clicked_cite_html, cite_num2idx)``:
        the answer HTML, the HTML listing every citation, the HTML listing
        the citations of the clicked statement (``None`` when no statement is
        clicked) and a dict mapping a citation label such as ``"[1,2]"`` to
        the index of the statement that carries it.
    """
    box = ('<div style="overflow-y: auto; padding: 20px; border: 0px dashed black; '
           'border-radius: 6px; background-color: #EFF2F6;">{}</div>')
    # Collect fragments and join once instead of growing a string in the loop.
    answer_parts = [html_styles + '<br><span class="label">Answer:</span><br>\n']
    all_cites = []
    clicked_cite_html = None
    cite_num2idx = {}
    idx = 0  # snippet numbering runs across the whole answer
    for i, js in enumerate(statements or []):
        is_clicked = clicked == i
        statement = process_text(js['statement'])
        citations = js['citation']
        if is_clicked:
            answer_parts.append(f'<span class="statement">{statement}</span>')
        else:
            answer_parts.append(f"<span>{statement}</span>")
        # Reset per statement so a clicked statement without citations never
        # shows the snippets of an earlier statement (or hits an unbound name).
        cite_html = []
        if citations:
            idxs = []
            for c in citations:
                idx += 1
                idxs.append(str(idx))
                cite = '[Sentence: {}-{}\t|\tChar: {}-{}]<br>\n<span {}>{}</span>'.format(
                    c['start_sentence_idx'],
                    c['end_sentence_idx'],
                    c['start_char_idx'],
                    c['end_char_idx'],
                    'class="highlight"' if is_clicked else "",
                    process_text(c['cite'].strip()),
                )
                cite_html.append(f'<span><span class="Bold">Snippet [{idx}]:</span><br>{cite}</span>')
            all_cites.extend(cite_html)
            cite_num = '[{}]'.format(','.join(idxs))
            cite_num2idx[cite_num] = i
            answer_parts.append(
                ' <span class="reference" style="color: blue" id={}>{}</span>'.format(i, cite_num)
            )
        answer_parts.append('\n')
        if is_clicked:
            clicked_cite_html = (
                html_styles
                + '<br><span class="label">Citations of current statement:</span><br>'
                + box.format("<br><br>\n".join(cite_html))
            )
    if all_cites:
        # The full list is shown without the highlight of the clicked snippet.
        listing = "<br><br>\n".join(all_cites).replace('<span class="highlight">', '<span>')
    else:
        listing = "No citation in the answer"
    all_cite_html = (
        html_styles
        + '<br><span class="label">All citations:</span><br>\n'
        + box.format(listing)
    )
    return "".join(answer_parts), all_cite_html, clicked_cite_html, cite_num2idx
94
+
95
def render_context(file):
    """Show the text of the uploaded document in the context textbox.

    Args:
        file: Value of the upload component; it must expose a ``name``
            attribute holding the path of the uploaded document.

    Returns:
        A visible ``gr.Textbox`` filled with the extracted text.

    Raises:
        gr.Error: If ``file`` has no ``name`` attribute.
    """
    if not hasattr(file, "name"):
        raise gr.Error("ERROR: no uploaded document")
    extracted = convert_to_txt(file.name)
    return gr.Textbox(extracted, visible=True)
101
+
102
def run_llm(context, query):
    """Answer *query* about *context* and build the component updates.

    Args:
        context: Text of the uploaded document.
        query: The user's question.

    Returns:
        A dict keyed by the output components: the raw statements, the
        rendered answer, the list of all citations, the citation-label to
        statement-index mapping, the radio choices (hidden when the answer
        has no citations) and the hidden per-statement citation panel.

    Raises:
        gr.Error: If the context or the query is empty.
    """
    if not context:
        raise gr.Error("Error: no uploaded document")
    if not query:
        raise gr.Error("Error: no query")

    llm_output = model.query_longcite(
        context,
        query,
        tokenizer=tokenizer,
        max_input_length=128000,
        max_new_tokens=1024,
    )
    all_statements = llm_output['all_statements']
    # No statement is clicked yet, so the third element is not needed here.
    answer_html, all_cite_html, _, cite_num2idx_dict = convert_to_html(all_statements)
    cite_nums = list(cite_num2idx_dict)
    has_citations = len(cite_nums) > 0
    return {
        statements: gr.JSON(all_statements),
        answer: gr.HTML(answer_html, visible=True),
        all_citations: gr.HTML(all_cite_html, visible=True),
        cite_num2idx: gr.JSON(cite_num2idx_dict),
        citation_choices: gr.Radio(cite_nums, visible=has_citations),
        clicked_citations: gr.HTML(visible=False),
    }
119
+
120
def chose_citation(statements, cite_num2idx, clicked_cite_num):
    """Re-render the answer with the selected citation group emphasised.

    Args:
        statements: The statements produced for the current answer.
        cite_num2idx: Mapping from a citation label (e.g. ``"[1,2]"``) to the
            index of the statement carrying it.
        clicked_cite_num: The label currently selected in the radio group.

    Returns:
        A dict with updates for the ``answer`` and ``clicked_citations``
        components. When the selection is empty or not part of the current
        mapping, the answer is rendered without emphasis and the citation
        panel is hidden instead of raising ``KeyError``.
    """
    clicked = (cite_num2idx or {}).get(clicked_cite_num)
    if clicked is None:
        # Nothing valid is selected: show the plain answer and hide the
        # per-statement panel.
        answer_html, _, _, _ = convert_to_html(statements or [])
        return {
            answer: gr.HTML(answer_html, visible=True),
            clicked_citations: gr.HTML(visible=False),
        }
    answer_html, _, clicked_cite_html, _ = convert_to_html(statements, clicked=clicked)
    return {
        answer: gr.HTML(answer_html, visible=True),
        clicked_citations: gr.HTML(clicked_cite_html, visible=True),
    }
127
+
128
# Page header: title, project links and a usage note.
_HEADER_HTML = """
<div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 20px;">
LongCite-glm4-9b Huggingface Space🤗
</div>
<div style="text-align: center;">
<a href="https://huggingface.co/THUDM/LongCite-glm4-9b">🤗 Model Hub</a> |
<a href="https://github.com/THUDM/LongCite">🌐 Github</a> |
<a href="https://arxiv.org/pdf/">📜 arxiv </a>
</div>
<br>
<div style="text-align: center; font-size: 15px; font-weight: bold; margin-bottom: 20px; line-height: 1.5;">
If you plan to use it long-term, please consider deploying the model or forking this space yourself.
</div>
"""

with gr.Blocks() as demo:
    gr.Markdown(_HEADER_HTML)

    # Top row: document upload and question on the left, extracted text on the right.
    with gr.Row():
        with gr.Column(scale=4):
            file = gr.File(label="Upload a document (supported type: pdf, docx, txt, md, py)")
            query = gr.Textbox(label='Question')
            submit_btn = gr.Button("Submit")

        with gr.Column(scale=4):
            context = gr.Textbox(
                label="Document content",
                autoscroll=False,
                placeholder="No uploaded document.",
                max_lines=10,
                visible=False,
            )

    file.upload(fn=render_context, inputs=[file], outputs=[context])

    # Bottom row: answer and citation picker on the left, citation panels on the right.
    with gr.Row():
        with gr.Column(scale=4):
            # Hidden JSON components carry state between the two callbacks.
            statements = gr.JSON(label="statements", visible=False)
            answer = gr.HTML(label="Answer", visible=True)
            cite_num2idx = gr.JSON(label="cite_num2idx", visible=False)
            citation_choices = gr.Radio(label="Chose citations for details", visible=False, interactive=True)

        with gr.Column(scale=4):
            clicked_citations = gr.HTML(label="Citations of the chosen statement", visible=False)
            all_citations = gr.HTML(label="All citations", visible=False)

    submit_btn.click(
        fn=run_llm,
        inputs=[context, query],
        outputs=[statements, answer, all_citations, cite_num2idx, citation_choices, clicked_citations],
    )
    citation_choices.change(
        fn=chose_citation,
        inputs=[statements, cite_num2idx, citation_choices],
        outputs=[answer, clicked_citations],
    )

demo.queue()
demo.launch()
requirement.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==4.41.0
2
+ torch==2.3.1
3
+ transformers==4.43.0
4
+ spaces==0.29.2
5
+ accelerate==0.33.0
6
+ sentencepiece==0.2.0
7
+ huggingface-hub==0.24.5
8
+ sentencepiece==0.2.0
9
+ jinja2==3.1.4
10
+ sentence_transformers==3.0.1
11
+ tiktoken==0.7.0
12
+ einops==0.8.0
13
+ nltk==3.8.1