Spaces:

ajanco
/

isbn-finder

Runtime error

isbn-finder / app.py

apjanco

handle ISBN without hyphens

907a067 almost 2 years ago

No virus

1.55 kB

	import re
	import tempfile
	from pathlib import Path

	import spacy
	import streamlit as st
	import textract
	from spacy.matcher import Matcher

	# Page header: the app title followed by an illustration loaded from
	# Wikimedia Commons. The URL is split only for readability; the adjacent
	# literals concatenate to exactly the original address.
	BANNER_URL = (
	    'https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/'
	    'A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg/'
	    '640px-A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg'
	)
	st.title('Find 13-digit ISBN Numbers')
	st.image(BANNER_URL)

	# Blank multi-language ('xx') pipeline; it is used here to tokenize the
	# extracted text so the token-based Matcher below can run over it.
	nlp = spacy.blank('xx')
	# Allow documents of up to 1.2 million characters.
	nlp.max_length = 1200000

	# Token pattern for a hyphenated ISBN: five all-digit tokens separated by
	# four literal '-' tokens (e.g. 978-3-16-148410-0).
	matcher = Matcher(nlp.vocab)
	pattern = [
	    {'IS_DIGIT': True},
	    {'ORTH': '-'},
	    {'IS_DIGIT': True},
	    {'ORTH': '-'},
	    {'IS_DIGIT': True},
	    {'ORTH': '-'},
	    {'IS_DIGIT': True},
	    {'ORTH': '-'},
	    {'IS_DIGIT': True},
	]
	matcher.add("ISBN", [pattern])

	# Fallback for ISBNs written without hyphens: any run of 13 digits.
	# Raw string so that `\d` reaches the regex engine as written instead of
	# being flagged as an invalid string escape by newer Python versions.
	regex = re.compile(r'(\d{13})')

	# ISBN strings found across all uploaded files, in discovery order.
	isbn_lines = []
	uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)

	# `or []` keeps the loop safe if the uploader yields nothing.
	for uploaded_file in uploaded_files or []:
	    # textract.process() works on a file path, so the upload has to be
	    # written to disk first. A throwaway directory keeps the original file
	    # name (and therefore its extension) while guaranteeing cleanup, and
	    # taking only the basename keeps the write inside that directory.
	    with tempfile.TemporaryDirectory() as tmp_dir:
	        tmp_path = Path(tmp_dir) / Path(uploaded_file.name).name
	        tmp_path.write_bytes(uploaded_file.read())
	        text = textract.process(str(tmp_path)).decode('utf-8')

	    # Unhyphenated ISBNs: plain 13-digit runs.
	    re_matches = regex.findall(text)
	    isbn_lines.extend(re_matches)

	    # Hyphenated ISBNs: token-level matches from the spaCy Matcher.
	    doc = nlp(text)
	    matches = matcher(doc)
	    st.write(f'Found {len(matches)+len(re_matches)} ISBN numbers')
	    isbn_lines.extend(doc[start:end].text for _, start, end in matches)

	# One ISBN per line, each terminated by a newline.
	isbn = ''.join(f'{line}\n' for line in isbn_lines)

	# The third positional parameter of download_button is the file name, so
	# the MIME type must be passed by keyword.
	st.download_button('Download', isbn, file_name='isbn.txt', mime='text/plain')