Spaces:
Build error
Build error
handle ISBN without hyphens
Browse files
app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import streamlit as st
|
2 |
import textract
|
3 |
from pathlib import Path
|
@@ -21,6 +22,8 @@ pattern = [{'IS_DIGIT': True},
|
|
21 |
{'IS_DIGIT': True}]
|
22 |
matcher.add("ISBN",[pattern])
|
23 |
|
|
|
|
|
24 |
isbn = """"""
|
25 |
uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
|
26 |
|
@@ -30,9 +33,13 @@ for uploaded_file in uploaded_files:
|
|
30 |
Path(uploaded_file.name).write_bytes(uploaded_file.read())
|
31 |
text = textract.process(uploaded_file.name)
|
32 |
text = text.decode('utf-8')
|
|
|
|
|
|
|
|
|
33 |
doc = nlp(text)
|
34 |
matches = matcher(doc)
|
35 |
-
st.write(f'Found {len(matches)} ISBN numbers')
|
36 |
for match_id,start,end in matches:
|
37 |
isbn += f"{doc[start:end]}\n"
|
38 |
|
|
|
1 |
+
import re
|
2 |
import streamlit as st
|
3 |
import textract
|
4 |
from pathlib import Path
|
|
|
22 |
{'IS_DIGIT': True}]
|
23 |
matcher.add("ISBN",[pattern])
|
24 |
|
25 |
+
# Un-hyphenated ISBN-13: exactly 13 consecutive digits. The lookarounds stop
# a 13-digit slice of a longer digit run from being reported as an ISBN.
regex = re.compile(r'(?<!\d)(\d{13})(?!\d)')
|
26 |
+
|
27 |
isbn = """"""
|
28 |
uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
|
29 |
|
|
|
33 |
Path(uploaded_file.name).write_bytes(uploaded_file.read())
|
34 |
text = textract.process(uploaded_file.name)
|
35 |
text = text.decode('utf-8')
|
36 |
+
re_matches = regex.findall(text)
|
37 |
+
for match in re_matches:
|
38 |
+
isbn += match + '\n'
|
39 |
+
|
40 |
doc = nlp(text)
|
41 |
matches = matcher(doc)
|
42 |
+
st.write(f'Found {len(matches)+len(re_matches)} ISBN numbers')
|
43 |
for match_id,start,end in matches:
|
44 |
isbn += f"{doc[start:end]}\n"
|
45 |
|