maxiw committed on
Commit
8a04aff
1 Parent(s): 7df7460

clean up and add markdownify

Browse files
Files changed (2) hide show
  1. app.py +20 -13
  2. requirements.txt +2 -1
app.py CHANGED
@@ -2,19 +2,23 @@ import gradio as gr
2
  from transformers import AutoTokenizer, AutoModelForCausalLM
3
  import spaces
4
  import re
 
5
 
6
 
7
  models = {
8
- "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained("jinaai/reader-lm-0.5b", trust_remote_code=True).to("cuda").eval(),
 
9
  }
10
 
11
  tokenizers = {
12
  "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained("jinaai/reader-lm-0.5b", trust_remote_code=True),
 
13
  }
14
 
15
 
16
  @spaces.GPU
17
- def run_example(html_content, model_id="jinaai/reader-lm-0.5b"):
 
18
  model = models[model_id]
19
  tokenizer = tokenizers[model_id]
20
  messages = [{"role": "user", "content": html_content}]
@@ -23,7 +27,9 @@ def run_example(html_content, model_id="jinaai/reader-lm-0.5b"):
23
  outputs = model.generate(inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08)
24
  pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
25
  assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
26
- return assistant_response[0]
 
 
27
 
28
 
29
  css = """
@@ -37,16 +43,17 @@ css = """
37
  with gr.Blocks(css=css) as demo:
38
  gr.Markdown("""
39
  # HTML-to-Markdown
 
40
  """)
41
- with gr.Tab(label="Main"):
42
- with gr.Row():
43
- with gr.Column():
44
- model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="jinaai/reader-lm-0.5b")
45
- html_content = gr.Textbox(label="HTML")
46
- submit_btn = gr.Button(value="Submit")
47
- with gr.Column():
48
- output_text = gr.Textbox(label="Markdown")
49
-
50
- submit_btn.click(run_example, [html_content, model_selector], [output_text])
51
 
52
  demo.launch(debug=True)
 
2
  from transformers import AutoTokenizer, AutoModelForCausalLM
3
  import spaces
4
  import re
5
+ from markdownify import markdownify
6
 
7
 
8
  models = {
9
+ "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained("jinaai/reader-lm-0.5b", trust_remote_code=True).eval().to("cuda"),
10
+ "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained("jinaai/reader-lm-1.5b", trust_remote_code=True).eval().to("cuda")
11
  }
12
 
13
  tokenizers = {
14
  "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained("jinaai/reader-lm-0.5b", trust_remote_code=True),
15
+ "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained("jinaai/reader-lm-1.5b", trust_remote_code=True),
16
  }
17
 
18
 
19
  @spaces.GPU
20
+ def run_example(html_content, model_id):
21
+ print("Start Model Processing")
22
  model = models[model_id]
23
  tokenizer = tokenizers[model_id]
24
  messages = [{"role": "user", "content": html_content}]
 
27
  outputs = model.generate(inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08)
28
  pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
29
  assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
30
+ print("Start Markdownify Processing")
31
+ markdownify_output = markdownify(html_content)
32
+ return assistant_response[0], markdownify_output
33
 
34
 
35
  css = """
 
43
  with gr.Blocks(css=css) as demo:
44
  gr.Markdown("""
45
  # HTML-to-Markdown
46
+ Try out model based HTML-to-Markdown with [Reader LM](https://huggingface.co/jinaai/reader-lm-1.5b) and rule based with [Markdownify](https://github.com/matthewwithanm/python-markdownify).
47
  """)
48
+ with gr.Row():
49
+ with gr.Column():
50
+ model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="jinaai/reader-lm-0.5b")
51
+ html_content = gr.Textbox(label="HTML")
52
+ submit_btn = gr.Button(value="Submit")
53
+ with gr.Column():
54
+ model_output_text = gr.Textbox(label="Reader LM Output")
55
+ markdownify_output = gr.Textbox(label="Markdownify Output")
56
+
57
+ submit_btn.click(run_example, [html_content, model_selector], [model_output_text, markdownify_output])
58
 
59
  demo.launch(debug=True)
requirements.txt CHANGED
@@ -1 +1,2 @@
1
- transformers<=4.43.4
 
 
1
+ transformers<=4.43.4
2
+ markdownify==0.13.1