# PatentClaimsExtraction
#
# Paused
#
# File size: 7,730 Bytes

import streamlit as st  
import gradio as gr  
import numpy as np  
import whisper  
import os  
import streamlit.components.v1 as components  
import tempfile  
import io  
import requests  
import json  
import openai  
from transformers import AutoConfig, AutoTokenizer, AutoModel  
from summarizer import Summarizer  
  
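# File upload size bug?
# NOTE(review): `server.maxUploadSize` (in MB) is a server-level option; it
# most likely cannot be changed from inside the script with `st.set_option`.
# Set it in `.streamlit/config.toml` or pass `--server.maxUploadSize` on the
# command line instead -- confirm against the Streamlit configuration docs.

# st.set_option('server.maxUploadSize', 500)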
  
# Seed the session-state slots only when they do not exist yet.  Streamlit
# re-executes this script from the top on every widget interaction, so
# assigning the defaults unconditionally would erase the stored transcript and
# the extracted claims on each rerun.

# Extracted claims (filled in by the "Extract Claims" step)
if 'claims_extraction' not in st.session_state:
    st.session_state.claims_extraction = ""

# Text to analyse (typed by the user or produced by the transcription step)
if 'userinput' not in st.session_state:
    st.session_state.userinput = ""
  
# Define a function to split text into chunks  
def chunk_text(text, chunk_size=2000):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each piece; must be a positive integer.

    Returns:
        A list of substrings that concatenate back to ``text``.  Every piece
        holds exactly ``chunk_size`` characters except possibly the last one.
        Empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.  The split position
            would never advance in that case, so the split could not finish.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
  
# Per-session defaults: each slot is created only when this session does not
# hold it yet, so the values survive Streamlit's script reruns.
if "learning_objectives" not in st.session_state:
    st.session_state["learning_objectives"] = ""

# The Whisper "base" model is loaded at most once per session and then reused
# from session state by the transcription step.
if "whisper_model" not in st.session_state:
    st.session_state["whisper_model"] = whisper.load_model("base")
  
# Streamlit Interface  
  
# Introductory banner shown at the top of the page.  The LegalBert link uses an
# absolute URL: a bare "nlpaueb/legal-bert-base-uncased" path would be resolved
# relative to the app's own address.
markdown_text = """
# 👋🏻Welcome to [Team](https://huggingface.co/TeamTonic) [Tonic](https://huggingface.co/Tonic) 's Patentable Claims Extractor.   
Here you can input audio and text and extract patentable claims from these conversational inputs using [LegalBert](https://huggingface.co/nlpaueb/legal-bert-base-uncased).   
- Save time and effort when ideating for your future business.  Expect latency upwards of 2.5 hours !
"""

# Render the Markdown content
st.markdown(markdown_text)
  
# API Key Input (masked field)
api_key = st.text_input("Enter your OpenAI API Key:", type="password")

# Audio Upload
st.write("Upload an audio file (supported formats: mp3, wav, ogg)")
audio_file = st.file_uploader("Choose an audio file", type=["mp3", "wav", "ogg"], key="audio_file")

# Raw bytes of the uploaded recording; stays None until a file is uploaded.
audio_data = None

if audio_file is not None:
    # getvalue() returns the whole buffer regardless of the current stream
    # position, so the bytes are still available if the upload was read before.
    audio_data = audio_file.getvalue()
    # Preview with the MIME type reported for the upload instead of assuming
    # wav, since mp3 and ogg files are accepted too.
    st.audio(audio_data, format=audio_file.type or "audio/wav")
    # Nothing has been transcribed at this point, so only report the upload.
    st.info("Audio uploaded. Click 'Start Transcription' to transcribe it.")
  
# Transcription: when the button is pressed, run the session's Whisper model on
# the uploaded audio and store the transcript as the text to analyse.
if st.button('Start Transcription'):
    if audio_data:
        model = st.session_state.whisper_model

        # transcribe() is given a file path here, so the uploaded bytes are
        # written to a temporary file that keeps the upload's own extension.
        suffix = os.path.splitext(audio_file.name)[1] or ".wav"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_audio:
            tmp_audio.write(audio_data)
            audio_file_path = tmp_audio.name

        try:
            with st.spinner("Transcribing..."):
                result = model.transcribe(audio_file_path)
        finally:
            # The file was created with delete=False, so remove it explicitly
            # whether or not the transcription succeeded.
            os.remove(audio_file_path)

        transcript = result['text']
        # Report success only after the transcription has actually finished.
        st.success("Transcription complete")
        st.session_state.userinput = transcript
    else:
        st.warning("Please upload an audio file before starting the transcription.")

# Editable input text.  It is rendered on every run (not only right after a
# transcription) so that text can also be typed or corrected by hand; its
# current content becomes the text used by the claims extraction step.
st.session_state.userinput = st.text_area("Input Text:", st.session_state.userinput)
  
# Model Selection Dropdown
model_choice = st.selectbox(
    "Select the model you want to use:",
    ["gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo", "gpt-4-0314", "gpt-4-0613", "gpt-4"]
)

# System prompt describing the assistant's role; sent with every chunk below.
context = "You are a patent claims identifier and extractor. You will receive freeform text and identify any claims contained therein that may be patentable. You identify, extract, and print such claims, and briefly explain why each claim is patentable."

# Initialize OpenAI API
if api_key:
    openai.api_key = api_key

# Patentable claims section
st.write("### Patentable Claims:")

# Holds the model's answer for the chunk processed most recently
claims_extraction = ""

# Placeholder that shows progress while chunks are processed and the combined
# result once extraction has finished
learning_status_placeholder = st.empty()

# The button is always rendered but stays disabled until both the input text
# and the API key are available.
source_text = st.session_state.get("userinput", "")
disable_button_bool = not (source_text and api_key)

# The widget key must differ from the `claims_extraction` session-state slot:
# that slot is assigned by this script, and Streamlit does not allow a button's
# key to be set through st.session_state.
if st.button("Extract Claims", key="extract_claims_button", disabled=disable_button_bool):
    # Split the input into chunks that are sent to the model one at a time
    input_chunks = chunk_text(source_text)

    extracted_parts = []

    # enumerate() numbers the chunks by position, which stays correct even
    # when two chunks happen to contain identical text.
    for chunk_number, chunk in enumerate(input_chunks, start=1):
        learning_status_placeholder.text(f"Extracting Patentable Claims for chunk {chunk_number}...")

        # One chat completion per chunk: the role description goes in the
        # system message, the chunk itself in the user message.
        claims_extraction_response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[
                {"role": "system", "content": context},
                {"role": "user", "content": f"Extract any patentable claims from the following: \n {chunk}. \n Extract each claim. Briefly explain why you extracted this word phrase. Exclude any additional commentary."}
            ]
        )

        claims_extraction = claims_extraction_response['choices'][0]['message']['content']
        extracted_parts.append(claims_extraction.strip())

    # Keep a blank line between the results of consecutive chunks so they do
    # not run into each other.
    all_extracted_claims = "\n\n".join(extracted_parts)

    # Save the combined result to session state for the later steps
    st.session_state.claims_extraction = all_extracted_claims

    learning_status_placeholder.text(f"Patentable Claims Extracted!\n{all_extracted_claims.strip()}")
  
# Read back the claims stored in session state under `claims_extraction`;
# fall back to an empty string when the slot does not exist.
claims_extracted = st.session_state.get("claims_extraction", "")

# Display the extracted claims.  The check looks at the value that was just
# read: no `claims_extracted` key is ever written to session state, so testing
# for that key would never show this section.
if isinstance(claims_extracted, str) and claims_extracted:
    st.text("Extracted Claims:")
    st.text(claims_extracted)
  
# Define the BERT-based model name
model_name = 'nlpaueb/legal-bert-base-uncased'

# Build the summarizer once per session and keep it in session state, the same
# way the Whisper model is kept.  Streamlit reruns this script on every widget
# interaction; without the guard the model would be loaded again each time.
if 'bert_legal_model' not in st.session_state:
    custom_config = AutoConfig.from_pretrained(model_name)
    # NOTE(review): hidden states are enabled for the summarizer -- presumably
    # it derives sentence embeddings from them; confirm against its docs.
    custom_config.output_hidden_states = True
    custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
    custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
    st.session_state.bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
    print('Using model {}\n'.format(model_name))

bert_legal_model = st.session_state.bert_legal_model
  
# Summarise the extracted claims with the BERT-based summarizer.
# `claims_extracted` was read from session state earlier in the script.

# Maximum number of characters handed to the summarizer per call
chunk_size = 350

# Split the extracted claims with the shared chunking helper; anything that is
# not a string yields no chunks.
if isinstance(claims_extracted, str):
    chunks = chunk_text(claims_extracted, chunk_size)
else:
    chunks = []

# One summary per chunk, in chunk order
summaries = [bert_legal_model(chunk, min_length=20, ratio=0.9) for chunk in chunks]

# Display the BERT summaries once, each under its own numbered heading
if summaries:
    st.text("BERT Summaries:")
    for i, summary in enumerate(summaries, start=1):
        st.write(f"### Summary {i}")
        st.write(summary)
  
# Credits footer with links to the contributors and the community Discord.
# It is wrapped in a <sub> tag, so raw HTML has to be allowed when rendering.
footer_html = "<sub>This app was created by [Tonic](https://huggingface.co/tonic) with help from [MIND INTERFACES](https://huggingface.co/MIND-INTERFACES) & [Taylor](https://huggingface.co/Cloudfaith) [join us on discord](https://discord.gg/5RmtZVVfgQ) </sub>"
st.markdown(footer_html, unsafe_allow_html=True)