"""Streamlit app that collects 13-digit ISBN numbers from uploaded files.

Each uploaded file is converted to text with textract.  Two detectors are
then run on that text: a regular expression for 13 consecutive digits and a
spaCy ``Matcher`` pattern for five digit tokens separated by hyphen tokens.
Every hit is offered to the user as a plain-text download, one per line.
"""
import re
import tempfile
from pathlib import Path

import spacy
import streamlit as st
import textract
from spacy.matcher import Matcher

st.title('Find 13-digit ISBN Numbers')
st.image('https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg/640px-A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg')

# Blank multi-language pipeline: only the tokenizer is needed for the Matcher.
nlp = spacy.blank('xx')
nlp.max_length = 1200000

# Hyphenated form: digits - digits - digits - digits - digits.
matcher = Matcher(nlp.vocab)
pattern = [
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
]
matcher.add("ISBN", [pattern])

# Unhyphenated form: 13 consecutive digits.  Raw string so that ``\d`` is a
# regex escape rather than an (invalid) string escape.
regex = re.compile(r'(\d{13})')


def _extract_text(uploaded_file):
    """Return the text textract extracts from one uploaded file.

    textract works on a file path, so the upload is written into a private
    temporary directory that is removed again as soon as extraction is done.
    Only the base name of the client-supplied file name is used, so the
    upload cannot be written outside that directory, while the file
    extension is kept on the path handed to textract.
    """
    safe_name = Path(uploaded_file.name).name
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / safe_name
        # getvalue() returns the full buffer regardless of the current
        # stream position.
        tmp_path.write_bytes(uploaded_file.getvalue())
        return textract.process(str(tmp_path)).decode('utf-8')


uploaded_files = st.file_uploader(
    "Select files to process", accept_multiple_files=True
)

found = []  # every ISBN candidate, in the order it was discovered
for uploaded_file in uploaded_files or []:
    text = _extract_text(uploaded_file)

    re_matches = regex.findall(text)
    found.extend(re_matches)

    doc = nlp(text)
    matches = matcher(doc)
    st.write(f'Found {len(matches)+len(re_matches)} ISBN numbers')
    found.extend(doc[start:end].text for _match_id, start, end in matches)

# One ISBN per line, each line newline-terminated.
isbn = ''.join(f'{number}\n' for number in found)

# file_name and mime are passed by keyword: the third positional parameter of
# download_button is the file name, not the MIME type.
st.download_button('Download', isbn, file_name='isbn.txt', mime='text/plain')