Spaces:
Sleeping
Sleeping
Pclanglais
committed on
Commit
•
1fca231
1
Parent(s):
fa86caf
Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,7 @@ import shutil
|
|
11 |
import requests
|
12 |
import pandas as pd
|
13 |
import difflib
|
|
|
14 |
|
15 |
# Define the device
|
16 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
@@ -168,24 +169,32 @@ def split_text(text, max_tokens=500):
|
|
168 |
|
169 |
|
170 |
# Function to generate text
|
171 |
-
def ocr_correction(prompt, max_new_tokens=600):
|
172 |
-
|
173 |
prompt = f"""### Text ###\n{prompt}\n\n\n### Correction ###\n"""
|
174 |
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
|
175 |
|
|
|
|
|
|
|
176 |
# Generate text
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
|
182 |
# Decode and return the generated text
|
183 |
result = tokenizer.decode(output[0], skip_special_tokens=True)
|
184 |
-
|
185 |
print(result)
|
186 |
-
|
187 |
result = result.split("### Correction ###")[1]
|
188 |
-
|
189 |
return result
|
190 |
|
191 |
# OCR Correction Class
|
|
|
11 |
import requests
|
12 |
import pandas as pd
|
13 |
import difflib
|
14 |
+
from concurrent.futures import ThreadPoolExecutor
|
15 |
|
16 |
# Define the device
|
17 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
169 |
|
170 |
|
171 |
# Function to generate text
|
172 |
+
def ocr_correction(prompt, max_new_tokens=600, num_threads=os.cpu_count()):
|
|
|
173 |
prompt = f"""### Text ###\n{prompt}\n\n\n### Correction ###\n"""
|
174 |
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
|
175 |
|
176 |
+
# Set the number of threads for PyTorch
|
177 |
+
torch.set_num_threads(num_threads)
|
178 |
+
|
179 |
# Generate text
|
180 |
+
with ThreadPoolExecutor(max_workers=num_threads) as executor:
|
181 |
+
future = executor.submit(
|
182 |
+
model.generate,
|
183 |
+
input_ids,
|
184 |
+
max_new_tokens=max_new_tokens,
|
185 |
+
pad_token_id=tokenizer.eos_token_id,
|
186 |
+
top_k=50,
|
187 |
+
num_return_sequences=1,
|
188 |
+
do_sample=True,
|
189 |
+
temperature=0.7
|
190 |
+
)
|
191 |
+
output = future.result()
|
192 |
|
193 |
# Decode and return the generated text
|
194 |
result = tokenizer.decode(output[0], skip_special_tokens=True)
|
|
|
195 |
print(result)
|
196 |
+
|
197 |
result = result.split("### Correction ###")[1]
|
|
|
198 |
return result
|
199 |
|
200 |
# OCR Correction Class
|