isbn-finder / app.py
apjanco
adding post on process
a8ca5c6
raw
history blame
No virus
1.4 kB
from pathlib import Path
import tempfile

import spacy
from spacy.matcher import Matcher
import streamlit as st
import textract
# Page header: title followed by an illustration.
st.title('Find 13-digit ISBN Numbers')
st.image(
    'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg/640px-A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg'
)

# Blank spaCy pipeline for the 'xx' language code, with its character limit
# set to 1.2M.
nlp = spacy.blank('xx')
nlp.max_length = 1200000

# Token pattern for a hyphenated ISBN: nine tokens alternating between a
# digits-only token (even positions) and a literal '-' (odd positions),
# i.e. five digit groups joined by four hyphens.
pattern = [
    {'ORTH': '-'} if position % 2 else {'IS_DIGIT': True}
    for position in range(9)
]

matcher = Matcher(nlp.vocab)
matcher.add("ISBN", [pattern])
# One "<isbn>\n" entry per match, accumulated across every uploaded file.
isbn_lines = []
uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
for uploaded_file in uploaded_files:
    # textract reads from a filesystem path and picks its parser from the
    # file extension, so spool the upload to a temporary file that keeps the
    # original suffix. This avoids writing a client-chosen file name into the
    # working directory (where it could overwrite app files) and lets us
    # clean up afterwards.
    suffix = Path(uploaded_file.name).suffix
    # delete=False: the file must be closed before textract reopens it by
    # name (required on Windows); it is removed explicitly below.
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
        tmp_file.write(uploaded_file.read())
    tmp_path = Path(tmp_file.name)
    try:
        text = textract.process(str(tmp_path)).decode('utf-8')
    finally:
        # Remove the temporary copy even if text extraction fails.
        tmp_path.unlink()

    doc = nlp(text)
    matches = matcher(doc)
    st.write(f'Found {len(matches)} ISBN numbers')
    isbn_lines.extend(f"{doc[start:end]}\n" for _, start, end in matches)

# Newline-terminated list of every ISBN found, offered as a text download.
isbn = "".join(isbn_lines)
# The third positional parameter of download_button is the file name, so the
# MIME type has to be passed by keyword.
st.download_button('Download', isbn, file_name='isbn.txt', mime='text/plain')