Spaces:

ajanco
/

isbn-finder

Runtime error

apjanco committed on Jul 26, 2022

Commit

5ef0d94

•

1 Parent(s): 5e06212

first commit!

Files changed (3) hide show

app.py ADDED Viewed

+import streamlit as st
+import textract
+from pathlib import Path
+import spacy
+from spacy.matcher import Matcher
+st.title('Find 13-digit ISBN Numbers')
+st.image('https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg/640px-A_Small_Book_of_Designs_copy_A_object_21_The_First_Book_of_Urizen_plate_5.jpg')
+nlp = spacy.blank('xx')
+nlp.max_length = 1200000
+matcher = Matcher(nlp.vocab)
+pattern = [{'IS_DIGIT': True},
+           {'ORTH': '-'},
+           {'IS_DIGIT': True},
+           {'ORTH': '-'},
+           {'IS_DIGIT': True},
+           {'ORTH': '-'},
+           {'IS_DIGIT': True},
+           {'ORTH': '-'},
+           {'IS_DIGIT': True}]
+matcher.add("ISBN",[pattern])
+isbn = """isbn\n"""
+uploaded_files = st.file_uploader("Select files to process", accept_multiple_files=True)
+for uploaded_file in uploaded_files:
+    file_type = uploaded_file.type
+    Path(uploaded_file.name).write_bytes(uploaded_file.read())
+    text = textract.process(uploaded_file.name)
+    text = text.decode('utf-8')
+    doc = nlp(text)
+    matches = matcher(doc)
+    st.write(f'Found {len(matches)} ISBN numbers')
+    for match_id,start,end in matches:
+        isbn += f"{doc[start:end]}\n"
+st.download_button('Download CSV', isbn, 'text/csv')

packages.txt ADDED Viewed

+python-dev
+libxml2-dev
+libxslt1-dev
+antiword
+unrtf
+poppler-utils
+pstotext
+tesseract-ocr
+flac
+ffmpeg
+lame
+libmad0
+libsox-fmt-mp3
+sox
+libjpeg-dev
+swig

requirements.txt ADDED Viewed

+streamlit==1.2.0
+textract==1.6.5
+spacy==3.4.0