initial commit
Changed files:
- app.py (+111, -0)
- requirements.txt (+62, -0)
- text_transformation_tools.py (+55, -0)
app.py
ADDED
@@ -0,0 +1,111 @@
import streamlit as st
import text_transformation_tools as ttt
from transformers import pipeline
import plotly.express as px


def read_pdf(file):
    # Delegate text extraction to the helper module; returns a list of paragraphs.
    text = ttt.pdf_to_text(file)
    return text


def analyze_text(paragraphs, topics, model, mode, min_chars, min_score):
    with st.spinner('Loading model...'):
        classifier = pipeline('zero-shot-classification', model=model)

    relevant_parts = {}
    for topic in topics:
        relevant_parts[topic] = []

    if mode == 'paragraphs':
        text = paragraphs
    elif mode == 'sentences':
        text = []
        for paragraph in paragraphs:
            for sentence in paragraph.split('.'):
                text.append(sentence)

    with st.spinner('Analyzing text...'):
        counter = 0
        counter_rel = 0
        counter_tot = len(text)

        with st.empty():
            for sequence_to_classify in text:
                # Strip newlines and collapse double spaces before classifying.
                cleansed_sequence = sequence_to_classify.replace('\n', '').replace('  ', ' ')

                # Skip very short chunks to avoid classifying noise.
                if len(cleansed_sequence) >= min_chars:
                    classified = classifier(cleansed_sequence, topics, multi_label=True)

                    # Keep the chunk under every topic whose score clears the threshold.
                    for idx in range(len(classified['scores'])):
                        if classified['scores'][idx] >= min_score:
                            relevant_parts[classified['labels'][idx]].append(sequence_to_classify)
                            counter_rel += 1

                counter += 1
                st.write('Analyzed {} of {} {}. Found {} relevant {} so far.'.format(counter, counter_tot, mode, counter_rel, mode))

    return relevant_parts


CHOICES = {
    'facebook/bart-large-mnli': 'bart-large-mnli (very slow, english)',
    'valhalla/distilbart-mnli-12-1': 'distilbart-mnli-12-1 (slow, english)',
    'BaptisteDoyen/camembert-base-xnli': 'camembert-base-xnli (fast, french)',
    'typeform/mobilebert-uncased-mnli': 'mobilebert-uncased-mnli (very fast, english)',
    'Sahajtomar/German_Zeroshot': 'German_Zeroshot (slow, german)',
    'MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7': 'mDeBERTa-v3-base-xnli-multilingual-nli-2mil7 (fast, multilingual)',
}


def format_func(option):
    return CHOICES[option]


st.header('File and topics')
uploaded_file = st.file_uploader('Choose your .pdf file', type='pdf')
topics = st.text_input(label='Enter comma-separated sustainability topics of interest.', value='human rights, sustainability')

st.header('Settings')
col1, col2 = st.columns(2)

with col1:
    model = st.selectbox('Select the model used to analyze the PDF.', options=list(CHOICES.keys()), format_func=format_func, index=3)
    mode = st.selectbox(label='Choose whether to detect relevant paragraphs or sentences.', options=['paragraphs', 'sentences'])
with col2:
    min_chars = st.number_input(label='Minimum number of characters a chunk must have to be analyzed', min_value=0, max_value=500, value=20)
    probability = st.number_input(label='Minimum probability of being relevant to accept (in percent)', min_value=0, max_value=100, value=90) / 100

topics = [topic.strip() for topic in topics.split(',')]

st.header('Analyze PDF')

if st.button('Analyze PDF'):
    with st.spinner('Reading PDF...'):
        text = read_pdf(uploaded_file)
        page_count = ttt.count_pages(uploaded_file)
        language = ttt.detect_language(' '.join(text))[0]
        st.subheader('Overview')
        st.write('Our PDF reader detected {} pages and {} paragraphs. We assume that the language of this text is "{}".'.format(page_count, len(text), language))

    st.subheader('Analysis')
    relevant_parts = analyze_text(text, topics, model, mode, min_chars, probability)

    counts = [len(relevant_parts[topic]) for topic in topics]

    fig = px.bar(x=topics, y=counts, title='Found {} of Relevance'.format(mode))
    st.plotly_chart(fig)

    st.subheader('Relevant Passages')
    st.write(relevant_parts)
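For context, `analyze_text` leans on the output contract of the Hugging Face zero-shot-classification pipeline: a dict whose `labels` and `scores` lists are parallel and sorted by descending score. A minimal standalone sketch of that contract (the example sentence and topics are illustrative, not part of the app):

    from transformers import pipeline

    # Same default model the app preselects (index=3 in CHOICES).
    classifier = pipeline('zero-shot-classification', model='typeform/mobilebert-uncased-mnli')

    result = classifier('Our factories are audited for fair working conditions.',
                        ['human rights', 'sustainability'], multi_label=True)

    # result has keys 'sequence', 'labels', 'scores'; with multi_label=True each
    # topic is scored independently, so several can clear the 0.9 threshold at once.
    for label, score in zip(result['labels'], result['scores']):
        print(label, round(score, 2))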
requirements.txt
ADDED
@@ -0,0 +1,62 @@
altair==4.2.0
attrs==22.1.0
blinker==1.5
cachetools==5.2.0
certifi==2022.9.14
cffi==1.15.1
charset-normalizer==2.1.1
click==8.1.3
commonmark==0.9.1
cryptography==38.0.1
decorator==5.1.1
entrypoints==0.4
filelock==3.8.0
gitdb==4.0.9
GitPython==3.1.27
huggingface-hub==0.9.1
idna==3.4
importlib-metadata==4.12.0
Jinja2==3.1.2
jsonschema==4.16.0
langid==1.1.6
MarkupSafe==2.1.1
numpy==1.23.3
packaging==21.3
pandas==1.5.0
pdfminer.six==20220524
Pillow==9.2.0
plotly==5.10.0
protobuf==3.20.1
pyarrow==9.0.0
pycparser==2.21
pydeck==0.8.0b3
Pygments==2.13.0
Pympler==1.0.1
PyMuPDF==1.20.2
pyparsing==3.0.9
pyrsistent==0.18.1
python-dateutil==2.8.2
pytz==2022.2.1
pytz-deprecation-shim==0.1.0.post0
PyYAML==6.0
regex==2022.9.13
requests==2.28.1
rich==12.5.1
semver==2.13.0
six==1.16.0
smmap==5.0.0
streamlit==1.13.0
tenacity==8.1.0
tokenizers==0.12.1
toml==0.10.2
toolz==0.12.0
torch==1.12.1
tornado==6.2
tqdm==4.64.1
transformers==4.22.1
typing_extensions==4.3.0
tzlocal==4.2
urllib3==1.26.12
validators==0.20.0
watchdog==2.1.9
zipp==3.8.1
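To reproduce the environment and launch the app locally, the usual Streamlit workflow should apply (assuming the three files sit in the repository root):

    pip install -r requirements.txt
    streamlit run app.py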
text_transformation_tools.py
ADDED
@@ -0,0 +1,55 @@
'''
This module contains helper functions to load PDFs, extract their text and generate additional metadata.

It was initially created for the businessresponsibility.ch project of the Prototype Fund. For more
information visit https://github.com/bizres
'''
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer

import fitz  # PyMuPDF

import langid
langid.set_languages(['en', 'de', 'fr', 'it'])

import pandas as pd


def pdf_to_text(file):
    '''
    This function extracts text from a PDF and splits it into paragraphs.

    Parameters:
        file: path to the PDF, or a file-like object
    '''
    text = extract_text(file)
    paragraphs = text.split('\n\n')
    return paragraphs


def detect_language(text):
    '''
    This function detects the language of a text using langid.
    '''
    return langid.classify(text)


def count_pages(pdf_file):
    '''
    This function counts the pages of a PDF using pdfminer.
    '''
    return len(list(extract_pages(pdf_file)))


def pdf_text_to_sections(text):
    '''
    This function generates a pandas DataFrame from the extracted text. Each section
    is annotated with the page it is on and a running section_index.
    '''
    sections = []
    page_nr = 0
    section_index = 0
    # Note: paragraph breaks ('\n\n') are treated as page breaks here.
    for page in text.split('\n\n'):
        page_nr += 1
        for section in page.split('\n'):
            sections.append([page_nr, section_index, section])
            section_index += 1

    return pd.DataFrame(sections, columns=['page', 'section_index', 'section_text'])
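A short usage sketch of the helpers above ('report.pdf' is a placeholder file name):

    import text_transformation_tools as ttt

    paragraphs = ttt.pdf_to_text('report.pdf')          # list of paragraph strings
    print(ttt.count_pages('report.pdf'))                # page count via pdfminer
    print(ttt.detect_language(' '.join(paragraphs)))    # e.g. ('en', -54.2), per langid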