edithram23 committed
Commit aba3b27
1 Parent(s): 3a1f54d

Update app.py

Files changed (1)
  1. app.py +83 -76
app.py CHANGED
@@ -1,76 +1,83 @@
- from transformers import AutoTokenizer
- from transformers import AutoModelForSeq2SeqLM
- import streamlit as st
- import fitz # PyMuPDF
- from docx import Document
- import re
- import nltk
- nltk.download('punkt')
-
- def sentence_tokenize(text):
-     sentences = nltk.sent_tokenize(text)
-     return sentences
-
- model_dir_large = 'edithram23/Redaction_Personal_info_v1'
- tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
- model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
-
- def mask_generation(text,model=model_large,tokenizer=tokenizer_large):
-     inputs = ["Mask Generation: " + text+'.']
-     inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
-     output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
-     decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
-     predicted_title = decoded_output.strip()
-     pattern = r'\[.*?\]'
-     # Replace all occurrences of the pattern with [redacted]
-     redacted_text = re.sub(pattern, '[redacted]', predicted_title)
-     return redacted_text
-
-
-
- def read_pdf(file):
-     pdf_document = fitz.open(stream=file.read(), filetype="pdf")
-     text = ""
-     for page_num in range(len(pdf_document)):
-         page = pdf_document.load_page(page_num)
-         text += page.get_text()
-     return text
-
- def read_docx(file):
-     doc = Document(file)
-     text = "\n".join([para.text for para in doc.paragraphs])
-     return text
-
- def read_txt(file):
-     text = file.read().decode("utf-8")
-     return text
-
- def process_file(file):
-     if file.type == "application/pdf":
-         return read_pdf(file)
-     elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-         return read_docx(file)
-     elif file.type == "text/plain":
-         return read_txt(file)
-     else:
-         return "Unsupported file type."
-
- st.title("File Reader")
-
- uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])
-
- if uploaded_file is not None:
-     file_contents = process_file(uploaded_file)
-     token = sentence_tokenize(file_contents)
-     final=''
-     for i in range(0, len(token)):
-         final+=mask_generation(token[i])+'\n'
-     processed_text = final
-     st.text_area("File Contents", processed_text, height=400)
-
-     st.download_button(
-         label="Download Processed File",
-         data=processed_text,
-         file_name="processed_file.txt",
-         mime="text/plain",
-     )
+ from transformers import AutoTokenizer
+ from transformers import AutoModelForSeq2SeqLM
+ import streamlit as st
+ import fitz # PyMuPDF
+ from docx import Document
+ import re
+ import nltk
+ nltk.download('punkt')
+
+ def sentence_tokenize(text):
+     sentences = nltk.sent_tokenize(text)
+     return sentences
+
+ model_dir_large = 'edithram23/Redaction_Personal_info_v1'
+ tokenizer_large = AutoTokenizer.from_pretrained(model_dir_large)
+ model_large = AutoModelForSeq2SeqLM.from_pretrained(model_dir_large)
+
+ def mask_generation(text,model=model_large,tokenizer=tokenizer_large):
+     if(len(text)<30):
+         text = text+'.'
+     inputs = ["Mask Generation: " + text.lower()+'.']
+     inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
+     output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=len(text))
+     decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
+     predicted_title = decoded_output.strip()
+     pattern = r'\[.*?\]'
+     # Replace all occurrences of the pattern with [redacted]
+     redacted_text = re.sub(pattern, '[redacted]', predicted_title)
+     return redacted_text
+
+
+
+ def read_pdf(file):
+     pdf_document = fitz.open(stream=file.read(), filetype="pdf")
+     text = ""
+     for page_num in range(len(pdf_document)):
+         page = pdf_document.load_page(page_num)
+         text += page.get_text()
+     return text
+
+ def read_docx(file):
+     doc = Document(file)
+     text = "\n".join([para.text for para in doc.paragraphs])
+     return text
+
+ def read_txt(file):
+     text = file.read().decode("utf-8")
+     return text
+
+ def process_file(file):
+     if file.type == "application/pdf":
+         return read_pdf(file)
+     elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+         return read_docx(file)
+     elif file.type == "text/plain":
+         return read_txt(file)
+     else:
+         return "Unsupported file type."
+
+ st.title("File Reader")
+ user = st.text_input("Input Text to Redact")
+ uploaded_file = st.file_uploader("Upload a file", type=["pdf", "docx", "txt"])
+ if(user != ''):
+     token = sentence_tokenize(user)
+     final=''
+     for i in range(0, len(token)):
+         final+=mask_generation(token[i])+'\n'
+     st.text_area("OUTPUT",final,height=400)
+ if uploaded_file is not None:
+     file_contents = process_file(uploaded_file)
+     token = sentence_tokenize(file_contents)
+     final=''
+     for i in range(0, len(token)):
+         final+=mask_generation(token[i])+'\n'
+     processed_text = final
+     st.text_area("OUTPUT", processed_text, height=400)
+
+     st.download_button(
+         label="Download Processed File",
+         data=processed_text,
+         file_name="processed_file.txt",
+         mime="text/plain",
+     )
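
The substantive change in this commit is twofold: mask_generation now pads inputs shorter than 30 characters with a trailing period and lowercases the text before building the "Mask Generation: " prompt, and the UI gains a st.text_input box so short snippets can be redacted without uploading a file. Below is a minimal sketch of calling the updated masking logic outside Streamlit; the sample sentence is illustrative, and it assumes the edithram23/Redaction_Personal_info_v1 checkpoint is reachable on the Hugging Face Hub.

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import re

model_dir = 'edithram23/Redaction_Personal_info_v1'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

def mask_generation(text):
    # Mirror the commit's behaviour: pad short inputs, lowercase everything.
    if len(text) < 30:
        text = text + '.'
    inputs = tokenizer(["Mask Generation: " + text.lower() + '.'],
                       max_length=512, truncation=True, return_tensors="pt")
    # Note: the app caps generation at the character count of the input,
    # even though max_length is measured in tokens.
    output = model.generate(**inputs, num_beams=8, do_sample=True,
                            max_length=len(text))
    decoded = tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()
    # Normalise any [bracketed] span the model emits to a uniform token.
    return re.sub(r'\[.*?\]', '[redacted]', decoded)

print(mask_generation("My name is John Smith and I live in Chicago."))

The full app is launched with streamlit run app.py.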