crocidoc committed
Commit cc83a1d
1 Parent(s): 0f9a5e8

initial commit

Files changed (3)
  1. app.py +111 -0
  2. requirements.txt +6 -0
  3. text_transformation_tools.py +55 -0
app.py ADDED
@@ -0,0 +1,111 @@
+ import streamlit as st
+ import text_transformation_tools as ttt
+ from transformers import pipeline
+ import plotly.express as px
+
+
+ def read_pdf(file):
+     # Extract the paragraphs of the uploaded PDF via the helper module.
+     text = ttt.pdf_to_text(file)
+     return text
+
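+ # analyze_text runs a Hugging Face zero-shot classification pipeline over the
+ # extracted paragraphs (or, in 'sentences' mode, their individual sentences)
+ # and collects every passage whose score for a topic reaches the threshold.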
+ def analyze_text(paragraphs, topics, model, mode, min_chars, prob):
+     with st.spinner('Loading model...'):
+         classifier = pipeline('zero-shot-classification', model=model)
+
+     # One bucket of relevant passages per topic.
+     relevant_parts = {}
+     for topic in topics:
+         relevant_parts[topic] = []
+
+     # In 'sentences' mode, naively split each paragraph at full stops.
+     if mode == 'paragraphs':
+         text = paragraphs
+     elif mode == 'sentences':
+         text = []
+         for paragraph in paragraphs:
+             for sentence in paragraph.split('.'):
+                 text.append(sentence)
+
+     min_score = prob
+
+     with st.spinner('Analyzing text...'):
+         counter = 0
+         counter_rel = 0
+         counter_tot = len(text)
+
+         with st.empty():
+             for sequence_to_classify in text:
+                 # Drop line breaks and collapse double spaces before classifying.
+                 cleansed_sequence = sequence_to_classify.replace('\n', '').replace('  ', ' ')
+
+                 if len(cleansed_sequence) >= min_chars:
+                     classified = classifier(cleansed_sequence, topics, multi_label=True)
+
+                     for idx in range(len(classified['scores'])):
+                         if classified['scores'][idx] >= min_score:
+                             relevant_parts[classified['labels'][idx]].append(sequence_to_classify)
+                             counter_rel += 1
+
+                 counter += 1
+                 st.write('Analyzed {} of {} {}. Found {} relevant {} so far.'.format(counter, counter_tot, mode, counter_rel, mode))
+
+     return relevant_parts
+
+
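+ # Hugging Face model ids offered in the select box, mapped to the
+ # human-readable labels (with a rough speed/language hint) shown to the user.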
+ CHOICES = {
+     'facebook/bart-large-mnli': 'bart-large-mnli (very slow, english)',
+     'valhalla/distilbart-mnli-12-1': 'distilbart-mnli-12-1 (slow, english)',
+     'BaptisteDoyen/camembert-base-xnli': 'camembert-base-xnli (fast, french)',
+     'typeform/mobilebert-uncased-mnli': 'mobilebert-uncased-mnli (very fast, english)',
+     'Sahajtomar/German_Zeroshot': 'German_Zeroshot (slow, german)',
+     'MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7': 'mDeBERTa-v3-base-xnli-multilingual-nli-2mil7 (fast, multilingual)'}
+
+
+ def format_func(option):
+     return CHOICES[option]
+
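+ # --- Streamlit page: file upload, topics, settings, and the analysis trigger ---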
+ st.header('File and topics')
+ uploaded_file = st.file_uploader('Choose your .pdf file', type='pdf')
+ topics = st.text_input(label='Enter comma-separated sustainability topics of interest.', value='human rights, sustainability')
+
+
+ st.header('Settings')
+ col1, col2 = st.columns(2)
+
+ with col1:
+     model = st.selectbox('Select the model used to analyze the pdf.', options=list(CHOICES.keys()), format_func=format_func, index=3)
+     mode = st.selectbox(label='Choose whether to detect relevant paragraphs or sentences.', options=['paragraphs', 'sentences'])
+ with col2:
+     min_chars = st.number_input(label='Minimum number of characters a text must have to be analyzed', min_value=0, max_value=500, value=20)
+     probability = st.number_input(label='Minimum probability of being relevant to accept (in percent)', min_value=0, max_value=100, value=90) / 100
+
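+ # Turn the comma-separated topics input into a list of stripped topic strings.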
+ topics = topics.split(',')
+ topics = [topic.strip() for topic in topics]
+
+ st.header('Analyze PDF')
+
+ if st.button('Analyze PDF'):
+     with st.spinner('Reading PDF...'):
+         text = read_pdf(uploaded_file)
+         page_count = ttt.count_pages(uploaded_file)
+         language = ttt.detect_language(' '.join(text))[0]
+         st.subheader('Overview')
+         st.write('Our pdf reader detected {} pages and {} paragraphs. We assume that the language of this text is "{}".'.format(page_count, len(text), language))
+
+     st.subheader('Analysis')
+     relevant_parts = analyze_text(text, topics, model, mode, min_chars, probability)
+
+     counts = [len(relevant_parts[topic]) for topic in topics]
+     fig = px.bar(x=topics, y=counts, title='Found {} of Relevance'.format(mode))
+     st.plotly_chart(fig)
+
+     st.subheader('Relevant Passages')
+     st.write(relevant_parts)
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ transformers[torch]
+ pdfminer.six
+ langid
+ pandas
+ streamlit
+ plotly
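+ # install with: pip install -r requirements.txt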
text_transformation_tools.py ADDED
@@ -0,0 +1,55 @@
+ '''
+ This module contains helper functions to load pdfs, extract their text and generate additional metadata.
+
+ It was initially created for the businessresponsibility.ch project of the Prototype Fund. For more
+ information visit https://github.com/bizres
+ '''
+ from pdfminer.high_level import extract_pages, extract_text
+
+ import langid
+ langid.set_languages(['en', 'de', 'fr', 'it'])
+
+ import pandas as pd
+
+ def pdf_to_text(file):
+     '''
+     This function extracts text from a pdf and splits it into paragraphs.
+
+     Parameters:
+         file: file object or path of the pdf
+     '''
+     text = extract_text(file)
+     paragraphs = text.split('\n\n')
+     return paragraphs
+
+
+ def detect_language(text):
+     '''
+     This function detects the language of a text using langid and returns a
+     (language, score) tuple.
+     '''
+     return langid.classify(text)
+
+
+ def count_pages(pdf_file):
+     '''
+     This function counts the pages of a pdf using pdfminer.
+     '''
+     return len(list(extract_pages(pdf_file)))
+
+
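+ # pdf_text_to_sections assumes the extracted text separates pages with blank
+ # lines; page numbers derived this way are therefore approximate.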
+ def pdf_text_to_sections(text):
+     '''
+     This function generates a pandas DataFrame from the extracted text. Each section
+     is provided with the page it is on and a section_index.
+     '''
+     sections = []
+     page_nr = 0
+     section_index = 0
+     for page in text.split('\n\n'):
+         page_nr += 1
+         for section in page.split('\n'):
+             sections.append([page_nr, section_index, section])
+             section_index += 1
+
+     return pd.DataFrame(sections, columns=['page', 'section_index', 'section_text'])
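+
+
+ # Minimal usage sketch (with a hypothetical file 'report.pdf'):
+ #
+ #     with open('report.pdf', 'rb') as f:
+ #         paragraphs = pdf_to_text(f)
+ #     language, score = detect_language(' '.join(paragraphs))
+ #     sections = pdf_text_to_sections('\n\n'.join(paragraphs))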