PatentClaimsExtraction

Paused

App Files Files Community

PatentClaimsExtraction / app.py

Tonic

Update app.py

acd3106 over 1 year ago

raw

history blame contribute delete

7.73 kB

	import streamlit as st
	import gradio as gr
	import numpy as np
	import whisper
	import os
	import streamlit.components.v1 as components
	import tempfile
	import io
	import requests
	import json
	import openai
	from transformers import AutoConfig, AutoTokenizer, AutoModel
	from summarizer import Summarizer

	# Upload-size limit: an earlier revision tried to raise it from inside the
	# script with st.set_option('server.maxUploadSize', 500) and left it disabled.
	# NOTE(review): this option is normally configured at server level
	# (`.streamlit/config.toml`, `[server] maxUploadSize`) — confirm before
	# re-enabling anything here.

	# Streamlit re-executes the whole script on every user interaction, so the
	# session slots are only seeded when they are missing. Assigning them
	# unconditionally would erase the stored claims and user input on each rerun.
	if 'claims_extraction' not in st.session_state:
	    st.session_state.claims_extraction = ""

	if 'userinput' not in st.session_state:
	    st.session_state.userinput = ""

	# Define a function to split text into chunks
	def chunk_text(text: str, chunk_size: int = 2000) -> list:
	    """Split *text* into consecutive pieces of at most *chunk_size* characters.

	    Args:
	        text: The string to split. An empty string yields an empty list.
	        chunk_size: Maximum number of characters per piece; must be positive.

	    Returns:
	        The pieces in order; every piece except possibly the last has exactly
	        ``chunk_size`` characters.

	    Raises:
	        ValueError: If ``chunk_size`` is not a positive integer. A value of
	            zero or less would otherwise never advance through the text.
	    """
	    if chunk_size <= 0:
	        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
	    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

	# Per-session defaults: each slot is filled only when it is not present yet.

	# Whisper "base" model: loaded outside any button handler and kept in
	# session state so later steps can read it from there.
	if 'whisper_model' not in st.session_state:
	    st.session_state['whisper_model'] = whisper.load_model("base")

	# Empty placeholder for the learning-objectives text.
	if 'learning_objectives' not in st.session_state:
	    st.session_state['learning_objectives'] = ""

	# Streamlit Interface

	# Welcome banner. It is assembled from single-line literals so that source
	# indentation can never end up inside the Markdown (leading whitespace would
	# turn the heading into a code block). The LegalBert link is an absolute
	# Hugging Face URL; a bare "nlpaueb/..." target is a relative link.
	markdown_text = (
	    "\n"
	    "# 👋🏻Welcome to [Team](https://huggingface.co/TeamTonic) [Tonic](https://huggingface.co/Tonic) 's Patentable Claims Extractor.\n"
	    "Here you can input audio and text and extract patentable claims from these conversational inputs using [LegalBert](https://huggingface.co/nlpaueb/legal-bert-base-uncased).\n"
	    "- Save time and effort when ideating for your future business. Expect latency upwards of 2.5 hours !\n"
	)

	# Render the Markdown content
	st.markdown(markdown_text)

	# API Key Input (masked; used further down to configure the OpenAI client)
	api_key = st.text_input("Enter your OpenAI API Key:", type="password")

	# Audio Upload
	st.write("Upload an audio file (supported formats: mp3, wav, ogg)")
	audio_file = st.file_uploader("Choose an audio file", type=["mp3", "wav", "ogg"], key="audio_file")

	# Raw bytes of the uploaded audio, or None while nothing has been uploaded.
	audio_data = None

	if audio_file is not None:
	    # getvalue() returns the full buffer regardless of the current stream
	    # position, so a rerun cannot end up with an empty read.
	    audio_data = audio_file.getvalue()
	    # Preview with the MIME type reported for the upload (mp3/ogg are not
	    # wav); fall back to the previous hard-coded value if it is missing.
	    st.audio(audio_data, format=getattr(audio_file, "type", None) or "audio/wav")
	    # No transcription status is shown here: nothing is transcribed until
	    # the 'Start Transcription' button is pressed.

	# Transcribe the uploaded audio on demand.
	if st.button('Start Transcription'):
	    model = st.session_state.whisper_model

	    if audio_data:
	        # Whisper is given a file path, so the bytes are spooled to a
	        # temporary file. A distinct name avoids shadowing the `audio_file`
	        # upload object.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio:
	            tmp_audio.write(audio_data)
	            audio_file_path = tmp_audio.name
	        try:
	            # The file is closed (fully flushed) before anything reads it.
	            st.audio(audio_file_path, format="audio/wav")
	            st.info("Transcribing...")
	            result = model.transcribe(audio_file_path)
	        finally:
	            # delete=False leaves cleanup to us; remove the file even when
	            # transcription fails.
	            os.remove(audio_file_path)
	        # Keep the transcript in session state so it survives the rerun
	        # triggered by the next widget interaction.
	        st.session_state.transcript = result['text']
	        # Report success only once transcription has actually finished.
	        st.success("Transcription complete")
	    else:
	        st.warning("Please upload an audio file before starting the transcription.")

	# Empty string until a transcription has been produced in this session.
	transcript = st.session_state.get("transcript", "")

	# Display the Whisper transcription
	if transcript:
	    with st.expander("See transcript"):
	        st.markdown(transcript)
	    st.text("Transcription:")
	    st.text(transcript)

	# The input box is always rendered and pre-filled with the transcript (if
	# any), so text can also be typed or pasted without uploading audio. Its
	# current content is mirrored into session state for the extraction step.
	st.session_state.userinput = st.text_area("Input Text:", transcript)

	# Model Selection Dropdown: the chosen entry is the model name sent to OpenAI.
	model_options = [
	    "gpt-3.5-turbo-0301",
	    "gpt-3.5-turbo-0613",
	    "gpt-3.5-turbo",
	    "gpt-4-0314",
	    "gpt-4-0613",
	    "gpt-4",
	]
	model_choice = st.selectbox("Select the model you want to use:", model_options)

	# Role description for the assistant.
	# NOTE(review): `context` is not referenced anywhere else in the visible part
	# of this file — confirm whether it was meant to be sent as a system message.
	context = (
	    "You are a patent claims identifier and extractor. "
	    "You will freeform text, identify any claims contained therein that may be patentable. "
	    "You identify, extract, print such claims, briefly explain why each claim is patentable."
	)

	# Configure the OpenAI client only once a key has been entered.
	if api_key:
	    openai.api_key = api_key

	# Patentable claims section
	st.write("### Patentable Claims:")

	# Placeholder that is overwritten with per-chunk progress messages.
	learning_status_placeholder = st.empty()

	disable_button_bool = False

	# The button is only offered once there is input text and an API key.
	# Its widget key must differ from the "claims_extraction" session-state slot:
	# Streamlit does not allow a button's key to be assigned through
	# st.session_state.
	if st.session_state.userinput and api_key and st.button("Extract Claims", key="extract_claims_button", disabled=disable_button_bool):
	    # Split the user input into chunks (read from session state; there is no
	    # module-level `userinput` variable).
	    input_chunks = chunk_text(st.session_state.userinput)

	    # Claims extracted from each chunk, in input order.
	    extracted_parts = []

	    # enumerate() gives the right position even when two chunks are identical.
	    for chunk_number, chunk in enumerate(input_chunks, start=1):
	        # Display status message for the current chunk
	        learning_status_placeholder.text(f"Extracting Patentable Claims for chunk {chunk_number}...")

	        # One chat-completion request per chunk.
	        claims_extraction_response = openai.ChatCompletion.create(
	            model=model_choice,
	            messages=[
	                {"role": "user", "content": f"Extract any patentable claims from the following: \n {chunk}. \n Extract each claim. Briefly explain why you extracted this word phrase. Exclude any additional commentary."}
	            ]
	        )

	        # Text of the first choice in the API response.
	        claims_extraction = claims_extraction_response['choices'][0]['message']['content']
	        extracted_parts.append(claims_extraction.strip())

	    # Separate the per-chunk results with a blank line so the end of one
	    # chunk's output does not run into the start of the next.
	    all_extracted_claims = "\n\n".join(extracted_parts)

	    # Save the extracted claims to session state
	    st.session_state.claims_extraction = all_extracted_claims

	    # The claims themselves are rendered once, below.
	    learning_status_placeholder.text("Patentable Claims Extracted!")

	# Get the extracted claims from Streamlit's session state ("" if none yet).
	claims_extracted = st.session_state.get("claims_extraction", "")

	# Display the Extracted Claims
	if claims_extracted:
	    st.text("Extracted Claims:")
	    st.text(claims_extracted)

	# Define the BERT-based model name
	model_name = 'nlpaueb/legal-bert-base-uncased'

	# Build the summarizer once per session and keep it in session state (same
	# pattern as the Whisper model) instead of reloading the config, tokenizer
	# and weights on every script rerun.
	if 'bert_legal_model' not in st.session_state:
	    custom_config = AutoConfig.from_pretrained(model_name)
	    # Configure the model to return hidden states.
	    custom_config.output_hidden_states = True
	    custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
	    custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
	    st.session_state.bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
	    print('Using model {}\n'.format(model_name))

	bert_legal_model = st.session_state.bert_legal_model

	# Summarise the extracted claims (`claims_extracted`, read from session state
	# above) with the BERT-based summarizer.

	# Number of characters passed to the summarizer at a time.
	chunk_size = 350

	# Split the extracted claims into chunks with the shared helper; anything
	# that is not a string yields no chunks.
	if isinstance(claims_extracted, str):
	    chunks = chunk_text(claims_extracted, chunk_size)
	else:
	    chunks = []

	# One summary per chunk, in order.
	summaries = [bert_legal_model(chunk, min_length=20, ratio=0.9) for chunk in chunks]

	# Display the BERT summaries, each exactly once.
	if summaries:
	    st.text("BERT Summaries:")
	    for i, summary in enumerate(summaries, start=1):
	        st.write(f"### Summary {i}")
	        st.write(summary)

	# Credits footer: Markdown links wrapped in a <sub> tag, which is why raw
	# HTML has to be allowed for this call.
	credits_html = (
	    "<sub>This app was created by [Tonic](https://huggingface.co/tonic) "
	    "with help from [MIND INTERFACES](https://huggingface.co/MIND-INTERFACES) "
	    "& [Taylor](https://huggingface.co/Cloudfaith) "
	    "[join us on discord](https://discord.gg/5RmtZVVfgQ) </sub>"
	)
	st.markdown(credits_html, unsafe_allow_html=True)