isbn-finder / app.py
apjanco
handle ISBN without hyphens
907a067
raw
history blame contribute delete
No virus
1.55 kB
import re
import tempfile
from pathlib import Path

import spacy
import streamlit as st
import textract
from spacy.matcher import Matcher
# --- Page header ------------------------------------------------------------
st.title('Find 13-digit ISBN Numbers')
st.image('https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg/640px-A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg')

# --- Hyphenated ISBNs: spaCy token matcher -----------------------------------
# A blank pipeline is used here; only tokenization and the Matcher are needed.
nlp = spacy.blank('xx')
# Accept documents of up to 1,200,000 characters.
nlp.max_length = 1200000

matcher = Matcher(nlp.vocab)
# Five all-digit tokens separated by four literal '-' tokens, i.e. the
# "978-3-16-148410-0" layout of a hyphenated 13-digit ISBN.
# NOTE(review): this relies on the tokenizer emitting each hyphen as its own
# token -- confirm for the 'xx' tokenizer if matches are unexpectedly missing.
pattern = [
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
    {'ORTH': '-'},
    {'IS_DIGIT': True},
]
matcher.add("ISBN", [pattern])

# --- Unhyphenated ISBNs: plain regex -----------------------------------------
# Raw string so that `\d` is a regex escape rather than an (invalid) Python
# string escape. The pattern matches any run of 13 consecutive digits.
# NOTE(review): this also matches the first 13 digits of a longer digit run.
regex = re.compile(r'(\d{13})')

# Newline-separated ISBNs collected from every uploaded file; this string is
# what the download button serves.
isbn = ""
uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)

for uploaded_file in uploaded_files:
    # textract reads from a filesystem path, so the upload has to be written
    # out first. Use a throw-away directory instead of the working directory:
    # the client-supplied name can then never overwrite an application file,
    # and nothing is left behind once the text has been extracted. Only the
    # final path component of the name is kept, which preserves the extension.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / Path(uploaded_file.name).name
        tmp_path.write_bytes(uploaded_file.read())
        text = textract.process(str(tmp_path)).decode('utf-8')

    # Unhyphenated candidates: runs of 13 digits found by the regex.
    re_matches = regex.findall(text)

    # Hyphenated candidates: token spans found by the spaCy matcher.
    doc = nlp(text)
    matches = matcher(doc)

    # Per-file count covering both detection methods.
    st.write(f'Found {len(matches)+len(re_matches)} ISBN numbers')

    # Append this file's results, one per line: regex hits first, then the
    # matcher spans (the match id is not needed, only the token offsets).
    found = list(re_matches)
    found.extend(str(doc[start:end]) for _, start, end in matches)
    isbn += "".join(f"{item}\n" for item in found)

# Offer everything collected in this run as a single text file. `file_name`
# and `mime` are passed by keyword: the third positional parameter of
# `download_button` is the file name, not the MIME type.
st.download_button('Download', isbn, file_name='isbn.txt', mime='text/plain')