txtrct
Browse files- app.py +3 -21
- requirements.txt +1 -3
app.py
CHANGED
@@ -2,12 +2,8 @@ import streamlit as st
|
|
2 |
import torch
|
3 |
import torch.nn.functional as F
|
4 |
from torch import Tensor
|
5 |
-
|
6 |
-
import tempfile
|
7 |
import textract
|
8 |
-
import docx2txt
|
9 |
-
import pdfplumber
|
10 |
-
import io
|
11 |
import os
|
12 |
|
13 |
def last_token_pool(last_hidden_states: Tensor,
|
@@ -46,23 +42,9 @@ click = st.button("Search")
|
|
46 |
|
47 |
|
48 |
|
49 |
-
def extract_text(doc):
|
50 |
-
if doc.type == 'text/plain':
|
51 |
-
return doc.read().decode('utf-8')
|
52 |
-
|
53 |
-
if doc.name.endswith(".pdf"):
|
54 |
-
docPath = save_upload(doc)
|
55 |
-
|
56 |
-
|
57 |
-
with pdfplumber.open(docPath) as pdf:
|
58 |
-
pages = [page.extract_text() for page in pdf.pages]
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
if doc.name.endswith('.docx'):
|
64 |
-
raw_text = doc.read()
|
65 |
-
return docx2txt.process(raw_text)
|
66 |
|
67 |
return None
|
68 |
|
|
|
2 |
import torch
|
3 |
import torch.nn.functional as F
|
4 |
from torch import Tensor
|
5 |
+
|
|
|
6 |
import textract
|
|
|
|
|
|
|
7 |
import os
|
8 |
|
9 |
def last_token_pool(last_hidden_states: Tensor,
|
|
|
42 |
|
43 |
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
+
def extract_text(doc):
|
47 |
+
return textract.process(doc).decode('utf-8')
|
|
|
|
|
|
|
|
|
48 |
|
49 |
return None
|
50 |
|
requirements.txt
CHANGED
@@ -1,5 +1,3 @@
|
|
1 |
torch
|
2 |
transformers
|
3 |
-
textract
|
4 |
-
docx2txt
|
5 |
-
pdfplumber
|
|
|
1 |
torch
|
2 |
transformers
|
3 |
+
textract
|
|
|
|