apjanco committed on
Commit
907a067
1 Parent(s): a8ca5c6

handle ISBN without hyphens

Browse files
Files changed (1) hide show
  1. app.py +8 -1
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import streamlit as st
2
  import textract
3
  from pathlib import Path
@@ -21,6 +22,8 @@ pattern = [{'IS_DIGIT': True},
21
  {'IS_DIGIT': True}]
22
  matcher.add("ISBN",[pattern])
23
 
 
 
24
  isbn = """"""
25
  uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
26
 
@@ -30,9 +33,13 @@ for uploaded_file in uploaded_files:
30
  Path(uploaded_file.name).write_bytes(uploaded_file.read())
31
  text = textract.process(uploaded_file.name)
32
  text = text.decode('utf-8')
 
 
 
 
33
  doc = nlp(text)
34
  matches = matcher(doc)
35
- st.write(f'Found {len(matches)} ISBN numbers')
36
  for match_id,start,end in matches:
37
  isbn += f"{doc[start:end]}\n"
38
 
 
1
+ import re
2
  import streamlit as st
3
  import textract
4
  from pathlib import Path
 
22
  {'IS_DIGIT': True}]
23
  matcher.add("ISBN",[pattern])
24
 
25
+ regex = re.compile('(\d{13})')
26
+
27
  isbn = """"""
28
  uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
29
 
 
33
  Path(uploaded_file.name).write_bytes(uploaded_file.read())
34
  text = textract.process(uploaded_file.name)
35
  text = text.decode('utf-8')
36
+ re_matches = regex.findall(text)
37
+ for match in re_matches:
38
+ isbn += match + '\n'
39
+
40
  doc = nlp(text)
41
  matches = matcher(doc)
42
+ st.write(f'Found {len(matches)+len(re_matches)} ISBN numbers')
43
  for match_id,start,end in matches:
44
  isbn += f"{doc[start:end]}\n"
45