apjanco committed on
Commit
5ef0d94
1 Parent(s): 5e06212

first commit!

Browse files
Files changed (3) hide show
  1. app.py +39 -0
  2. packages.txt +16 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: extract hyphenated ISBN-style numbers from uploaded files.

Each uploaded file is converted to text with textract, tokenized with a blank
multi-language spaCy pipeline, and scanned with a token Matcher. All matches
are collected into a one-column CSV offered as a download.
"""
import os
import tempfile
from pathlib import Path

import streamlit as st
import textract
import spacy
from spacy.matcher import Matcher

st.title('Find 13-digit ISBN Numbers')
st.image('https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg/640px-A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg')

nlp = spacy.blank('xx')
nlp.max_length = 1200000
matcher = Matcher(nlp.vocab)
# Five all-digit tokens separated by literal hyphens, e.g. 978-3-16-148410-0.
# NOTE(review): the pattern does not check that the groups total 13 digits,
# despite the page title -- confirm whether stricter validation is wanted.
pattern = [
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
]
matcher.add("ISBN", [pattern])

# CSV rows; the first row is the column header.
rows = ["isbn"]
uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)

for uploaded_file in uploaded_files or []:
    # Write the upload to a private temporary file instead of the working
    # directory: the client-supplied name could otherwise overwrite app files.
    # The original suffix is preserved for the temporary file so textract
    # still sees the upload's extension.
    suffix = Path(uploaded_file.name).suffix
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(uploaded_file.read())
        tmp_path = tmp.name
    try:
        text = textract.process(tmp_path).decode('utf-8')
    finally:
        # Always remove the temporary copy, even if extraction fails.
        os.remove(tmp_path)

    doc = nlp(text)
    matches = matcher(doc)
    st.write(f'Found {len(matches)} ISBN numbers')
    rows.extend(doc[start:end].text for _, start, end in matches)

isbn = "\n".join(rows) + "\n"
# file_name and mime must be passed by keyword: the third positional
# parameter of st.download_button is file_name, not the MIME type.
st.download_button('Download CSV', isbn, file_name='isbn.csv', mime='text/csv')
packages.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python-dev
2
+ libxml2-dev
3
+ libxslt1-dev
4
+ antiword
5
+ unrtf
6
+ poppler-utils
7
+ pstotext
8
+ tesseract-ocr
9
+ flac
10
+ ffmpeg
11
+ lame
12
+ libmad0
13
+ libsox-fmt-mp3
14
+ sox
15
+ libjpeg-dev
16
+ swig
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ streamlit==1.2.0
2
+ textract==1.6.5
3
+ spacy==3.4.0