Spaces:

Pclanglais
/

Editorialization

Runtime error

App Files Files Community

Pclanglais committed on Jul 2, 2024

Commit

750020e

verified ·

1 Parent(s): 100e33a

Create app.py

Browse files

Files changed (1) hide show

app.py +146 -0

app.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import transformers
+import re
+from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM
+from vllm import LLM, SamplingParams
+import torch
+import gradio as gr
+import json
+import os
+import shutil
+import requests
+import chromadb
+import difflib
+import pandas as pd
+from chromadb.config import Settings
+from chromadb.utils import embedding_functions
+# Run on the GPU when one is available, otherwise fall back to the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Hub checkpoint shared by the classification pipeline and the tokenizer.
+model_checkpoint = "PleIAs/Estienne"
+# `pipeline` is reached through the `transformers` module because the bare
+# name is not imported in this file, and the model is `model_checkpoint`
+# because no `editorial_model` variable is defined anywhere in this file.
+token_classifier = transformers.pipeline(
+    "token-classification",
+    model=model_checkpoint,
+    aggregation_strategy="simple",
+    device=device,
+)
+# Tokenizer used by this file to count tokens when deciding how to chunk text.
+tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)
+def _split_oversized(text, max_tokens):
+    """Halve `text` repeatedly until every piece fits within `max_tokens`.
+
+    Each cut is made at the first whitespace at or after the midpoint; when
+    the second half holds no whitespace the cut is made at the midpoint
+    itself. Pieces are stripped and returned in their original order. A piece
+    shorter than two characters is returned as-is because it cannot be cut.
+    """
+    pieces = []
+    pending = [text]  # stack: the leftmost unprocessed piece is always on top
+    while pending:
+        piece = pending.pop()
+        if len(piece) < 2 or len(tokenizer.tokenize(piece)) <= max_tokens:
+            if piece:
+                pieces.append(piece)
+            continue
+        middle = len(piece) // 2
+        whitespace = re.search(r"\s", piece[middle:])
+        split_point = middle + whitespace.start() if whitespace else middle
+        # Both halves are strictly shorter than `piece`, so the loop terminates.
+        pending.append(piece[split_point:].strip())
+        pending.append(piece[:split_point].strip())
+    return pieces
+def split_text(text, max_tokens=500):
+    """Split `text` into chunks of at most `max_tokens` tokens.
+
+    Lines are first packed greedily: consecutive newline-separated parts are
+    joined back with a newline for as long as the joined text stays within the
+    limit. Any resulting chunk that is still too long (for example a single
+    very long line) is then halved at whitespace until it fits.
+
+    Token counts come from the module-level `tokenizer`.
+
+    Args:
+        text: The text to split.
+        max_tokens: Maximum number of tokens allowed per chunk.
+
+    Returns:
+        A list of non-empty chunk strings in their original order.
+    """
+    packed = []
+    current_chunk = ""
+    for part in text.split("\n"):
+        candidate = current_chunk + "\n" + part if current_chunk else part
+        if len(tokenizer.tokenize(candidate)) <= max_tokens:
+            current_chunk = candidate
+            continue
+        # The candidate is too long: flush what was accumulated and restart
+        # from the current part (which may itself be over the limit).
+        if current_chunk:
+            packed.append(current_chunk)
+        current_chunk = part
+    if current_chunk:
+        packed.append(current_chunk)
+    # Re-split every chunk that is still over the limit, not only the case
+    # where the newline pass produced a single chunk.
+    chunks = []
+    for chunk in packed:
+        chunks.extend(_split_oversized(chunk, max_tokens))
+    return chunks
+# Courtesy of Claude.
+def generate_html_diff(old_text, new_text):
+    """Render `new_text` as HTML with the words absent from `old_text` highlighted.
+
+    Both texts are split on whitespace and compared word by word. Unchanged
+    words are emitted as-is, added words are wrapped in a green-background
+    span, and removed words are dropped. Every word is HTML-escaped so that
+    markup present in the input is displayed rather than interpreted.
+
+    Args:
+        old_text: The reference text.
+        new_text: The text to display.
+
+    Returns:
+        A single string of space-separated HTML fragments.
+    """
+    from html import escape
+    differ = difflib.Differ()
+    html_diff = []
+    for entry in differ.compare(old_text.split(), new_text.split()):
+        marker, word = entry[:2], escape(entry[2:])
+        if marker == "  ":
+            html_diff.append(word)
+        elif marker == "+ ":
+            html_diff.append(f'<span style="background-color: #90EE90;">{word}</span>')
+        # Entries starting with "- " (removed words) and "? " (Differ hint
+        # lines) are intentionally left out of the output.
+    return " ".join(html_diff)
+# Wraps the module-level token-classification pipeline behind the callback
+# used by the Gradio UI. The class name is kept unchanged because module-level
+# code instantiates it under this name.
+class MistralChatBot:
+    """Turn user text into an HTML listing of the spans found by the classifier."""
+    def __init__(self, system_prompt="Le dialogue suivant est une conversation"):
+        # Stored for callers; predict() does not read it.
+        self.system_prompt = system_prompt
+    def predict(self, user_message):
+        """Classify `user_message` and return the result as an HTML string.
+
+        Newlines are replaced by a " ¶ " marker, the text is cut into chunks
+        of at most 500 tokens when needed, and each chunk is passed to the
+        module-level `token_classifier`. Every span returned is rendered as a
+        paragraph containing its label followed by its (HTML-escaped) text.
+
+        Args:
+            user_message: Raw text entered by the user.
+
+        Returns:
+            An HTML fragment: a heading followed by a `div.generation` block.
+        """
+        from html import escape
+        # Newlines are replaced by a visible marker rather than deleted.
+        editorial_text = re.sub("\n", " ¶ ", user_message)
+        if len(tokenizer.tokenize(editorial_text)) > 500:
+            batch_prompts = split_text(editorial_text, max_tokens=500)
+        else:
+            batch_prompts = [editorial_text]
+        results = token_classifier(batch_prompts)
+        # Defensive: if a flat list of span dicts comes back for a one-item
+        # batch, wrap it so that results always align with batch_prompts.
+        if results and isinstance(results[0], dict):
+            results = [results]
+        fragments = []
+        for chunk, spans in zip(batch_prompts, results):
+            for span in spans:
+                start, end = span.get("start"), span.get("end")
+                # Prefer the exact source slice; fall back to the pipeline's
+                # reconstructed text when character offsets are unavailable.
+                if start is not None and end is not None:
+                    span_text = chunk[start:end]
+                else:
+                    span_text = span["word"]
+                label = escape(str(span["entity_group"]))
+                fragments.append(f"<p><strong>{label}</strong> {escape(span_text)}</p>")
+        html_body = "\n".join(fragments)
+        generated_text = '<h2 style="text-align:center">Réponse</h2>\n<div class="generation">' + html_body + "</div>"
+        return generated_text
+# Single bot instance; the UI button callback is bound to its predict method.
+mistral_bot = MistralChatBot()
+# Display strings and sample inputs prepared for the Gradio interface.
+title = "Éditorialisation"
+description = "Un outil expérimental d'identification de la structure du texte à partir d'un encoder (Deberta)"
+# Each example row is [user_message, temperature].
+examples = [["Qui peut bénéficier de l'AIP?", 0.7]]
+# Temperature slider: 0.05 to 1.0 in steps of 0.05, defaulting to 0.2.
+additional_inputs = [
+    gr.Slider(
+        minimum=0.05,
+        maximum=1.0,
+        step=0.05,
+        value=0.2,
+        label="Température",
+        info="Des valeurs plus élevées donne plus de créativité, mais aussi d'étrangeté",
+        interactive=True,
+    ),
+]
+# Build the Gradio UI. No custom stylesheet is passed because no `css` value
+# is defined in this file; the `with` statement is what binds `demo`.
+with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
+    gr.HTML("""<h1 style="text-align:center">Correction d'OCR</h1>""")
+    text_input = gr.Textbox(label="Votre texte.", type="text", lines=1)
+    text_button = gr.Button("Identifier les structures éditoriales")
+    text_output = gr.HTML(label="Le texte corrigé")
+    # Clicking the button sends the textbox content to the bot and shows the
+    # returned HTML in the output component.
+    text_button.click(mistral_bot.predict, inputs=text_input, outputs=[text_output])
+# Start the app (with request queuing enabled) only when run as a script.
+if __name__ == "__main__":
+    demo.queue().launch()