"""Streamlit app that finds hyphenated 13-digit ISBN numbers in uploaded files.

Each uploaded file is converted to plain text with textract, tokenized with a
blank multi-language spaCy pipeline, and scanned with a token ``Matcher``.
All matches are collected into a newline-separated text file that the user
can download.
"""
import tempfile
from pathlib import Path

import spacy
import streamlit as st
import textract
from spacy.matcher import Matcher

# NOTE(review): the original script called ``st.image('')`` with an empty
# source. Set this to the intended image path/URL; while it is empty the
# banner is skipped instead of asking Streamlit to load a nameless file.
HEADER_IMAGE = ''

# Five runs of digits separated by hyphens, e.g. 978-3-16-148410-0.
# Assumes the tokenizer emits each hyphen as its own token -- TODO confirm
# for the blank 'xx' pipeline.
ISBN_PATTERN = [
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
]

st.title('Find 13-digit ISBN Numbers')
if HEADER_IMAGE:
    st.image(HEADER_IMAGE)

nlp = spacy.blank('xx')
# Allow documents somewhat longer than spaCy's default length limit.
nlp.max_length = 1200000
matcher = Matcher(nlp.vocab)
matcher.add("ISBN", [ISBN_PATTERN])


def _extract_text(uploaded_file) -> str:
    """Return the text content of a Streamlit uploaded file.

    textract works on file paths, so the upload is written to a temporary
    file that keeps the original suffix (textract chooses its parser from
    the file extension). The temporary file is always removed afterwards.

    Args:
        uploaded_file: An item returned by ``st.file_uploader``; its
            ``name`` and ``getvalue()`` are used.

    Returns:
        The extracted text, decoded as UTF-8.
    """
    suffix = Path(uploaded_file.name).suffix
    # delete=False so the file can be closed and then reopened by textract
    # (reopening an open NamedTemporaryFile is not portable).
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(uploaded_file.getvalue())
        tmp_path = Path(tmp.name)
    try:
        raw = textract.process(str(tmp_path))
    finally:
        tmp_path.unlink()
    return raw.decode('utf-8')


def _find_isbns(text: str) -> list:
    """Return the text of every span in *text* matched by the ISBN pattern."""
    doc = nlp(text)
    return [doc[start:end].text for _match_id, start, end in matcher(doc)]


found = []

uploaded_files = st.file_uploader(
    "Select files to process", accept_multiple_files=True
)
for uploaded_file in uploaded_files:
    matches = _find_isbns(_extract_text(uploaded_file))
    st.write(f'Found {len(matches)} ISBN numbers')
    found.extend(matches)

# One ISBN per line, each line newline-terminated.
isbn = ''.join(f"{number}\n" for number in found)

# file_name and mime are passed by keyword: the third positional parameter
# of st.download_button is the file name, not the MIME type.
st.download_button('Download', isbn, file_name='isbn.txt', mime='text/plain')