File size: 1,552 Bytes
907a067
5ef0d94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
907a067
 
bac843b
5ef0d94
 
 
 
a8ca5c6
5ef0d94
 
 
907a067
 
 
 
5ef0d94
 
907a067
5ef0d94
 
 
b8f3559
5ef0d94
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import re
import tempfile
from pathlib import Path

import spacy
import streamlit as st
import textract
from spacy.matcher import Matcher

# Page header: the app title followed by a decorative illustration.
BANNER_IMAGE_URL = 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg/640px-A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg'

st.title('Find 13-digit ISBN Numbers')
st.image(BANNER_IMAGE_URL)

# Blank multi-language ('xx') pipeline used to tokenize the extracted text.
nlp = spacy.blank('xx')
# Maximum number of characters accepted in a single text.
nlp.max_length = 1200000

# Token pattern for a hyphenated number: five all-digit tokens separated by
# four literal '-' tokens (digit, '-', digit, '-', digit, '-', digit, '-', digit).
pattern = [{'IS_DIGIT': True}]
for _ in range(4):
    pattern += [{'ORTH': '-'}, {'IS_DIGIT': True}]

matcher = Matcher(nlp.vocab)
matcher.add("ISBN", [pattern])

# Unhyphenated ISBN-13 candidate: a run of exactly 13 digits.
# The raw string avoids the invalid "\d" escape of a plain literal, and the
# look-arounds stop the pattern from reporting the first 13 digits of a longer
# digit run (e.g. a 16-digit account number) as an ISBN.
regex = re.compile(r'(?<!\d)(\d{13})(?!\d)')

# Every ISBN string found so far, in discovery order: for each file, the
# plain 13-digit regex hits first, then the hyphenated matcher hits.
found_isbns = []
uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)

# `or []` keeps the loop safe should the widget hand back None rather than a list.
for uploaded_file in uploaded_files or []:
    # textract is given a path on disk, so the upload is spooled to a scratch
    # file. A temp file (instead of the client-supplied name in the working
    # directory) means an upload can never overwrite an existing file, and the
    # original extension is kept on the scratch file name.
    suffix = Path(uploaded_file.name).suffix
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as scratch:
        # getvalue() returns the whole buffer regardless of the stream position.
        scratch.write(uploaded_file.getvalue())
    scratch_path = Path(scratch.name)
    try:
        text = textract.process(str(scratch_path)).decode('utf-8')
    finally:
        # Always remove the scratch file, even when extraction fails.
        scratch_path.unlink()

    re_matches = regex.findall(text)
    found_isbns.extend(re_matches)

    doc = nlp(text)
    matches = matcher(doc)
    st.write(f'Found {len(matches)+len(re_matches)} ISBN numbers')
    found_isbns.extend(str(doc[start:end]) for _match_id, start, end in matches)

# One ISBN per line, each line newline-terminated.
isbn = "".join(f"{entry}\n" for entry in found_isbns)

# file_name and mime are passed by keyword: the third positional parameter of
# download_button is file_name, so a positional 'text/plain' would become the
# download's file name instead of its MIME type.
st.download_button('Download', isbn, file_name='isbn.txt', mime='text/plain')