# Patentable Claims Extractor - Streamlit front end.
#
# Flow of this part of the script: collect an OpenAI API key, optionally
# transcribe an uploaded audio file with Whisper, let the user review/edit the
# input text, and pick the chat model used for claim extraction.

import io
import json
import os
import tempfile

import gradio as gr
import numpy as np
import openai
import requests
import streamlit as st
import streamlit.components.v1 as components
import whisper
from summarizer import Summarizer
from transformers import AutoConfig, AutoModel, AutoTokenizer

# File upload size bug?
# st.set_option('server.maxUploadSize', 500)

# Streamlit re-executes the whole script on every interaction, so session
# values must only be initialised when missing; assigning them unconditionally
# would wipe the user's text and results on each rerun.
for _state_key in ("claims_extraction", "userinput", "learning_objectives"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = ""


def chunk_text(text, chunk_size=2000):
    """Split *text* into consecutive slices of at most *chunk_size* characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each slice; must be positive.

    Returns:
        A list of slices in original order. Empty input yields an empty list.

    Raises:
        ValueError: If *chunk_size* is not positive (a non-positive size can
            never advance through the text).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]


# Load the Whisper model once per session; loading is slow, so it is cached in
# session state rather than rebuilt on every rerun.
if 'whisper_model' not in st.session_state:
    st.session_state.whisper_model = whisper.load_model("base")

# Streamlit Interface
markdown_text = """
# 👋🏻Welcome to [Team](https://huggingface.co/TeamTonic) [Tonic](https://huggingface.co/Tonic) 's Patentable Claims Extractor.
Here you can input audio and text and extract patentable claims from these conversational inputs using [LegalBert](https://huggingface.co/nlpaueb/legal-bert-base-uncased).
- Save time and effort when ideating for your future business. Expect latency upwards of 2.5 hours !
"""

# Render the Markdown content
st.markdown(markdown_text)

# API Key Input
api_key = st.text_input("Enter your OpenAI API Key:", type="password")

# Audio Upload
st.write("Upload an audio file (supported formats: mp3, wav, ogg)")
audio_file = st.file_uploader("Choose an audio file", type=["mp3", "wav", "ogg"], key="audio_file")

audio_data = None
if audio_file is not None:
    audio_data = audio_file.read()
    # Use the MIME type reported for the upload instead of always claiming WAV.
    st.audio(audio_data, format=audio_file.type or "audio/wav")

if st.button('Start Transcription'):
    model = st.session_state.whisper_model

    if audio_data:
        # Whisper's transcribe() is given a file path, so the upload is
        # spooled to a temporary file that keeps the original extension.
        suffix = os.path.splitext(audio_file.name)[1] or ".wav"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_audio:
            tmp_audio.write(audio_data)
            audio_file_path = tmp_audio.name

        st.info("Transcribing...")
        try:
            result = model.transcribe(audio_file_path)
        finally:
            # delete=False means nothing else will clean this file up.
            os.remove(audio_file_path)
        st.success("Transcription complete")

        transcript = result['text']

        with st.expander("See transcript"):
            st.markdown(transcript)

        # Display the Whisper transcription
        st.text("Transcription:")
        st.text(transcript)

        # Pre-fill the input field below with the transcription. This must be
        # assigned before the keyed text area is created in this run.
        st.session_state.userinput = transcript
    else:
        st.warning("Please upload an audio file before starting transcription.")

# The input field is always shown so plain text can be entered without audio;
# binding it to the "userinput" key keeps its content across reruns.
st.text_area("Input Text:", key="userinput")

# Model Selection Dropdown
model_choice = st.selectbox(
    "Select the model you want to use:",
    ["gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo", "gpt-4-0314", "gpt-4-0613", "gpt-4"]
)

# Context, Subject, and Level
context = "You are a patent claims identifier and extractor. You will freeform text, identify any claims contained therein that may be patentable. You identify, extract, print such claims, briefly explain why each claim is patentable."
# Initialize OpenAI API
if api_key:
    openai.api_key = api_key

# Patentable claims section
st.write("### Patentable Claims:")

# Holds the most recent single-chunk response
claims_extraction = ""

# Placeholder used for progress/status messages during extraction
learning_status_placeholder = st.empty()
disable_button_bool = False

# The button uses its own widget key: Streamlit does not allow a button's key
# to also be assigned through st.session_state, and the extracted text is
# stored under "claims_extraction".
if (
    st.session_state.userinput
    and api_key
    and st.button("Extract Claims", key="extract_claims_button", disabled=disable_button_bool)
):
    # Split the user input into chunks
    input_chunks = chunk_text(st.session_state.userinput)

    extracted_parts = []
    for chunk_number, chunk in enumerate(input_chunks, start=1):
        # Display status message for the current chunk
        learning_status_placeholder.text(f"Extracting Patentable Claims for chunk {chunk_number}...")

        prompt = (
            f"Extract any patentable claims from the following: \n {chunk}. \n "
            "Extract each claim. Briefly explain why you extracted this word phrase. \n"
            "Exclude any additional commentary."
        )

        # One chat completion per chunk; `context` is sent as the system
        # prompt describing the extractor role.
        claims_extraction_response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[
                {"role": "system", "content": context},
                {"role": "user", "content": prompt},
            ],
        )

        # Pull the generated text out of the API response
        claims_extraction = claims_extraction_response['choices'][0]['message']['content']
        extracted_parts.append(claims_extraction.strip())

    # Separate per-chunk results so they do not run into each other
    all_extracted_claims = "\n\n".join(extracted_parts)

    # Save the extracted claims to session state
    st.session_state.claims_extraction = all_extracted_claims

    # Display the extracted claims for all chunks
    learning_status_placeholder.text(f"Patentable Claims Extracted!\n{all_extracted_claims.strip()}")

# Get the extracted claims from Streamlit's session state
claims_extracted = st.session_state.claims_extraction

# Display the Extracted Claims
if claims_extracted:
    st.text("Extracted Claims:")
    st.text(claims_extracted)

# Define the BERT-based model name
model_name = 'nlpaueb/legal-bert-base-uncased'

# Build the LegalBERT summarizer once per session and cache it in session
# state, so it is not reloaded on every rerun.
if 'bert_legal_model' not in st.session_state:
    custom_config = AutoConfig.from_pretrained(model_name)
    custom_config.output_hidden_states = True
    custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
    custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
    st.session_state.bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
    print('Using model {}\n'.format(model_name))
bert_legal_model = st.session_state.bert_legal_model

# Define the chunk size
chunk_size = 350

# Split the extracted claims into chunks
if isinstance(claims_extracted, str):
    chunks = chunk_text(claims_extracted, chunk_size)
else:
    chunks = []

# Summarize each chunk with the BERT-based model; summaries[i] corresponds to
# chunks[i].
summaries = [bert_legal_model(chunk, min_length=20, ratio=0.9) for chunk in chunks]
# Display the BERT summaries once each (previously every summary was rendered
# twice by two back-to-back loops).
if summaries:
    st.text("BERT Summaries:")
    for summary_number, summary in enumerate(summaries, start=1):
        st.write(f"### Summary {summary_number}")
        st.write(summary)

# Credits
st.markdown("This app was created by [Tonic](https://huggingface.co/tonic) with help from [MIND INTERFACES](https://huggingface.co/MIND-INTERFACES) & [Taylor](https://huggingface.co/Cloudfaith) [join us on discord](https://discord.gg/5RmtZVVfgQ) ", unsafe_allow_html=True)