Pavan178 committed on
Commit
94cefaf
1 Parent(s): 18621da

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -0
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pdfplumber
3
+ import re
4
+ import tempfile
5
+ import os
6
+ import torch
7
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ import spaces
10
+
11
+
12
+
13
+ @spaces.GPU
14
def preprocess_text_for_tts(text):
    """Clean raw extracted text so it reads well as plain sentences.

    The text is passed through an ordered series of regex substitutions:
    non-printable/non-ASCII characters become spaces, URLs, e-mail addresses
    and 10-digit phone numbers are dropped, runs of dots are blanked,
    all-caps words are title-cased (except a short list of well-known
    abbreviations), and spacing around punctuation is normalised.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string with whitespace collapsed and both ends stripped.
    """
    keep_upper = {'AI', 'ML', 'NLP', 'CV', 'API', 'GPU', 'CPU', 'RAM', 'ROM', 'USA', 'UK', 'EU'}

    def _recase(hit):
        # All-caps words are title-cased unless they are known abbreviations.
        token = hit.group(0)
        if token in keep_upper:
            return token
        return token.title()

    # (pattern, replacement, flags) applied strictly in this order; later
    # rules rely on the output of earlier ones.
    rules = (
        (r'[^\x20-\x7E]', ' ', 0),                        # non-printable / non-ASCII
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),   # URLs
        (r'\S+@\S+', '', 0),                              # e-mail addresses
        (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '', 0),        # phone numbers
        (r'\.{2,}', ' ', 0),                              # ellipses / dot leaders
        (r'\b[A-Z]+\b', _recase, 0),                      # shouting words
        (r'\s+', ' ', 0),                                 # collapse whitespace
        (r'\.([A-Za-z])', r'. \1', 0),                    # space after a full stop
        (r'([a-z])([A-Z])', r'\1. \2', 0),                # lower→Upper seam becomes a sentence break
        (r'([A-Za-z])\s([.,!?])', r'\1\2', 0),            # no space before punctuation
        (r'([.,!?])([A-Za-z])', r'\1 \2', 0),             # one space after punctuation
        (r'\s+', ' ', 0),                                 # final whitespace collapse
    )
    for pattern, replacement, flags in rules:
        text = re.sub(pattern, replacement, text, flags=flags)

    return text.strip()
35
+
36
# Prefer the GPU when PyTorch reports CUDA support; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Checkpoint identifier handed to `from_pretrained` for both the tokenizer and
# the seq2seq model; the model is moved onto the selected device once at
# import time so every later call reuses it.
model_name = "sherif31/T5-Grammer-Correction"  # Replace with your actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)
44
+
45
+
46
def correct_text(text):
    """Run the grammar-correction model over ``text`` and return the result.

    The text is split into chunks of at most 512 characters so each model
    input stays short. Chunks are cut at whitespace, never inside a word:
    the corrected chunks are re-joined with a single space, so a cut in the
    middle of a word would leave a stray space inside that word in the
    output. Only a single word longer than the chunk limit is still split.

    Args:
        text: The text to correct. An empty or whitespace-only string
            produces no chunks and yields ``""``.

    Returns:
        The corrected chunks joined by single spaces.
    """
    # Local import keeps this function self-contained with respect to the
    # file's top-level import list.
    import textwrap

    max_chunk_length = 512
    chunks = textwrap.wrap(
        text,
        width=max_chunk_length,
        break_long_words=True,    # a single over-long token must still fit
        break_on_hyphens=False,   # don't turn "well-known" into "well- known"
    )

    corrected_chunks = []
    for chunk in chunks:
        input_text = f"grammar: {chunk}"
        # Token-level truncation is kept as a safety net on top of the
        # character-based chunking above.
        input_ids = tokenizer.encode(
            input_text,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        ).to(device)

        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_length=512,
                num_return_sequences=1,
                num_beams=5,
            )

        corrected_chunks.append(
            tokenizer.decode(output[0], skip_special_tokens=True)
        )

    return ' '.join(corrected_chunks)
63
+
64
def extract_text_from_pages(pdf_bytes):
    """Extract, clean and grammar-correct the text of every page in a PDF.

    The bytes are spooled to a temporary ``.pdf`` file, which is opened with
    pdfplumber and always removed afterwards, even if extraction fails.

    Args:
        pdf_bytes: The PDF document as raw bytes.

    Returns:
        A dict mapping 1-based page numbers to the processed page text.
        Pages with no extractable text map to an empty string.
    """
    # delete=False so the file survives the `with` and can be reopened by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as spool:
        spool.write(pdf_bytes)
        scratch_path = spool.name

    pages = {}
    try:
        with pdfplumber.open(scratch_path) as document:
            for number, page in enumerate(document.pages, start=1):
                extracted = page.extract_text()
                if not extracted:
                    pages[number] = ""
                    continue
                pages[number] = correct_text(preprocess_text_for_tts(extracted))
    finally:
        # Manual cleanup because the temp file was created with delete=False.
        os.unlink(scratch_path)

    return pages
85
+
86
def process_pdf(pdf_file):
    """Turn an uploaded PDF into a page-by-page text report.

    Args:
        pdf_file: The uploaded PDF as raw bytes (the Gradio input below is
            declared with ``type="binary"``), or ``None`` when nothing was
            uploaded.

    Returns:
        A user-facing message when no file was supplied; otherwise one
        ``"Page N:\\n<text>\\n\\n"`` section per page, concatenated in page
        order.
    """
    if pdf_file is None:
        return "No file uploaded. Please upload a PDF file."

    # extract_text_from_pages already cleans AND grammar-corrects every page.
    # Running correct_text over its output again doubled the inference cost,
    # re-corrected already-corrected text, and issued concurrent generate()
    # calls against the single shared model, so the text is used as returned.
    page_texts = extract_text_from_pages(pdf_file)

    return "".join(
        f"Page {page_num}:\n{text}\n\n"
        for page_num, text in page_texts.items()
    )
102
+
103
# Build the Gradio UI: a single PDF upload (delivered to the callback as raw
# bytes) wired to a text box that shows the processed result.
pdf_input = gr.File(label="Upload PDF", type="binary")
text_output = gr.Textbox(label="Extracted and Processed Text")

iface = gr.Interface(
    fn=process_pdf,
    inputs=pdf_input,
    outputs=text_output,
    title="PDF Text Extractor and Processor",
    description="Upload a PDF file to extract, clean, and correct its text content.",
)

# Start serving the app.
iface.launch()