Spaces:
Build error
Build error
File size: 1,552 Bytes
907a067 5ef0d94 907a067 bac843b 5ef0d94 a8ca5c6 5ef0d94 907a067 5ef0d94 907a067 5ef0d94 b8f3559 5ef0d94 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import re
import tempfile
from pathlib import Path

import spacy
import streamlit as st
import textract
from spacy.matcher import Matcher
# --- Page header -----------------------------------------------------------
st.title('Find 13-digit ISBN Numbers')
st.image('https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg/640px-A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg')

# --- Matching machinery ----------------------------------------------------
# A blank pipeline (tokenizer only) for spaCy's 'xx' language code; no trained
# components are loaded. The character limit is set explicitly so that texts
# of up to 1,200,000 characters can be processed.
nlp = spacy.blank('xx')
nlp.max_length = 1200000

# Token pattern for hyphenated ISBNs: five all-digit tokens separated by
# literal '-' tokens (e.g. 978-3-16-148410-0).
# NOTE(review): this can only match if the tokenizer emits each hyphen as its
# own token -- confirm against the 'xx' tokenizer's behaviour.
matcher = Matcher(nlp.vocab)
pattern = [
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
]
matcher.add("ISBN", [pattern])

# Un-hyphenated ISBNs: any run of 13 consecutive digits. Written as a raw
# string so that `\d` is a regex escape rather than an (invalid) Python string
# escape. The capture group is kept so `findall` returns the digit strings.
regex = re.compile(r'(\d{13})')

# Accumulates every ISBN found across all uploaded files, one per line.
isbn = ""
# Let the user upload any number of files, extract their text, and collect
# every ISBN found (13-digit runs via `regex`, hyphenated forms via `matcher`).
uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)

# `or []` tolerates a None return when nothing has been uploaded.
for uploaded_file in uploaded_files or []:
    # textract.process takes a file path, so the upload has to be spooled to
    # disk. Write it into a private temporary directory under a fixed name
    # instead of using the client-supplied file name in the working directory:
    # that name is untrusted input and could overwrite local files. Only the
    # extension is kept, because textract uses it to choose a parser. The
    # directory (and the file) is removed as soon as extraction finishes.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / f"upload{Path(uploaded_file.name).suffix}"
        tmp_path.write_bytes(uploaded_file.read())
        text = textract.process(str(tmp_path)).decode('utf-8')

    # Plain 13-digit runs.
    re_matches = regex.findall(text)

    # Hyphenated ISBNs found by the spaCy token matcher.
    doc = nlp(text)
    matches = matcher(doc)

    # Per-file summary shown in the page.
    st.write(f'Found {len(matches)+len(re_matches)} ISBN numbers')

    # Append this file's hits to the download payload, one per line: regex
    # hits first, then matcher spans (same order as before).
    isbn += "".join(f"{match}\n" for match in re_matches)
    isbn += "".join(f"{doc[start:end]}\n" for _, start, end in matches)

# Offer everything collected as a single text file. `file_name` and `mime` are
# passed by keyword: the third positional parameter of download_button is the
# file name, so passing 'text/plain' positionally used it as the file name.
st.download_button('Download', isbn, file_name='isbn.txt', mime='text/plain')
|