Tonic committed
Commit 4929bc6 · verified · 1 Parent(s): fa951d7

Update app.py

Files changed (1)
  app.py +66 -26
app.py CHANGED
@@ -1,36 +1,76 @@
-from difflib import Differ
-
+import torch
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
 import gradio as gr
 
-def diff_texts(text1, text2):
-    d = Differ()
-    return [
-        (token[2:], token[0] if token[0] != " " else None)
-        for token in d.compare(text1, text2)
-    ]
+# Load pre-trained model and tokenizer
+model_name = "PleIAs/OCRonos-Vintage"
+model = GPT2LMHeadModel.from_pretrained(model_name)
+tokenizer = GPT2Tokenizer.from_pretrained(model_name)
 
-demo = gr.Interface(
-    diff_texts,
-    [
-        gr.Textbox(
-            label="Text 1",
-            info="Initial text",
-            lines=3,
-            value="The quick brown fox jumped over the lazy dogs.",
-        ),
+# Set the device to GPU if available, otherwise use CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+
+def historical_generation(prompt, max_new_tokens=600):
+    prompt = f"### Text ###\n{prompt}"
+    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
+
+    # Generate text
+    output = model.generate(input_ids,
+                            max_new_tokens=max_new_tokens,
+                            pad_token_id=tokenizer.eos_token_id,
+                            top_k=50,
+                            temperature=0.3,
+                            top_p=0.95,
+                            do_sample=True,
+                            repetition_penalty=1.5)
+
+    # Decode the generated text
+    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+
+    # Remove the prompt from the generated text
+    generated_text = generated_text.replace("### Text ###\n", "").strip()
+
+    # Tokenize the generated text
+    tokens = tokenizer.tokenize(generated_text)
+
+    # Create highlighted text output
+    highlighted_text = []
+    for token in tokens:
+        # Remove special tokens and get the token type
+        clean_token = token.replace("Ġ", "").replace("</w>", "")
+        token_type = tokenizer.convert_ids_to_tokens([tokenizer.convert_tokens_to_ids(token)])[0]
+
+        highlighted_text.append((clean_token, token_type))
+
+    return highlighted_text
+
+# Create Gradio interface
+iface = gr.Interface(
+    fn=historical_generation,
+    inputs=[
         gr.Textbox(
-            label="Text 2",
-            info="Text to compare",
-            lines=3,
-            value="The fast brown fox jumps over lazy dogs.",
+            label="Prompt",
+            placeholder="Enter a prompt for historical text generation...",
+            lines=3
         ),
+        gr.Slider(
+            label="Max New Tokens",
+            minimum=50,
+            maximum=1000,
+            step=50,
+            value=600
+        )
     ],
-    gr.HighlightedText(
-        label="Diff",
+    outputs=gr.HighlightedText(
+        label="Generated Historical Text",
         combine_adjacent=True,
-        show_legend=True,
-        color_map={"+": "red", "-": "green"}),
+        show_legend=True
+    ),
+    title="Historical Text Generation with OCRonos-Vintage",
+    description="Generate historical-style text using the OCRonos-Vintage model. The output shows token types as highlights.",
     theme=gr.themes.Base()
 )
+
 if __name__ == "__main__":
-    demo.launch()
+    iface.launch()
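
Aside: in the new historical_generation helper, the "token type" paired with each token is just the BPE token string round-tripped through its id, so the HighlightedText legend ends up labeling each span with its own tokenizer form. For anyone who wants to exercise the model from this commit without the Gradio UI, a minimal sketch follows; it reuses the checkpoint and sampling settings from app.py, and the prompt is an invented example:

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Same checkpoint as app.py; sampling settings mirror the diff above.
model_name = "PleIAs/OCRonos-Vintage"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# app.py prepends this "### Text ###" header before generating.
prompt = "### Text ###\nOn a cold morning in 1833,"  # invented example prompt
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
output = model.generate(
    input_ids,
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id,
    top_k=50,
    temperature=0.3,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.5,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))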