Spaces:
Runtime error
Runtime error
import tempfile
from pathlib import Path

import spacy
import streamlit as st
import textract
from spacy.matcher import Matcher
# --- Page header -----------------------------------------------------------
HEADER_IMAGE_URL = 'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg/640px-A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg'

st.title('Find 13-digit ISBN Numbers')
st.image(HEADER_IMAGE_URL)

# --- Tokenizer and matcher -------------------------------------------------
# A blank 'xx' pipeline is enough here: the matcher below only looks at
# token-level attributes (IS_DIGIT / ORTH).
nlp = spacy.blank('xx')
nlp.max_length = 1200000

matcher = Matcher(nlp.vocab)

# The pattern is five all-digit tokens joined by four literal hyphens,
# i.e. digits-digits-digits-digits-digits.
# NOTE(review): nothing here checks that the groups add up to 13 digits.
pattern = [{'IS_DIGIT': True}]
for _ in range(4):
    pattern += [{'ORTH': '-'}, {'IS_DIGIT': True}]

matcher.add("ISBN",[pattern])
# Extract text from every uploaded file, run the ISBN matcher over it, and
# offer all matched spans (one per line) as a plain-text download.
isbn_lines = []
uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
for uploaded_file in uploaded_files:
    # textract needs a path on disk and chooses its parser from the file
    # extension, so keep the original suffix but let tempfile pick the name.
    # This avoids writing to the working directory under a client-supplied
    # filename (which could overwrite files or contain path separators).
    suffix = Path(uploaded_file.name).suffix
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        # getvalue() returns the whole buffer regardless of read position.
        tmp.write(uploaded_file.getvalue())
    tmp_path = Path(tmp.name)
    try:
        # The file is closed before textract opens it; textract returns bytes.
        text = textract.process(str(tmp_path)).decode('utf-8')
    finally:
        # Always remove the temporary copy, even if extraction fails.
        tmp_path.unlink()

    doc = nlp(text)
    matches = matcher(doc)
    st.write(f'Found {len(matches)} ISBN numbers')
    isbn_lines.extend(f"{doc[start:end]}\n" for _, start, end in matches)

isbn = "".join(isbn_lines)
# Pass file_name and mime by keyword: the third positional parameter of
# st.download_button is file_name, so 'text/plain' was previously being used
# as the download's filename instead of its MIME type.
st.download_button('Download', isbn, file_name='isbn.txt', mime='text/plain')