Spaces:

ajanco
/

isbn-finder

Runtime error

apjanco committed on Jul 26, 2022

Commit

907a067

•

1 Parent(s): a8ca5c6

handle ISBN without hyphens

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import streamlit as st
 import textract
 from pathlib import Path
@@ -21,6 +22,8 @@ pattern = [{'IS_DIGIT': True},
            {'IS_DIGIT': True}]
 matcher.add("ISBN",[pattern])
 isbn = """"""
 uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
@@ -30,9 +33,13 @@ for uploaded_file in uploaded_files:
     Path(uploaded_file.name).write_bytes(uploaded_file.read())
     text = textract.process(uploaded_file.name)
     text = text.decode('utf-8')
     doc = nlp(text)
     matches = matcher(doc)
-    st.write(f'Found {len(matches)} ISBN numbers')
     for match_id,start,end in matches:
         isbn += f"{doc[start:end]}\n"

+import re
 import streamlit as st
 import textract
 from pathlib import Path
            {'IS_DIGIT': True}]
 matcher.add("ISBN",[pattern])
+regex = re.compile('(\d{13})')
 isbn = """"""
 uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
     Path(uploaded_file.name).write_bytes(uploaded_file.read())
     text = textract.process(uploaded_file.name)
     text = text.decode('utf-8')
+    re_matches = regex.findall(text)
+    for match in re_matches:
+        isbn += match + '\n'
     doc = nlp(text)
     matches = matcher(doc)
+    st.write(f'Found {len(matches)+len(re_matches)} ISBN numbers')
     for match_id,start,end in matches:
         isbn += f"{doc[start:end]}\n"