# finchat222 / app.py
# Dependencies (the original notebook installed these with `!pip install` magics
# and wrote this script out via `%%writefile app.py`; as a standalone app.py,
# install them separately):
#   pip install streamlit pyngrok pdfplumber transformers tabula-py openai nltk torch
import os

import streamlit as st
import pdfplumber
import torch
import torch.nn.functional as F
import nltk
import openai
from torch import nn
from transformers import RobertaTokenizer, RobertaModel
from nltk.tokenize import sent_tokenize
# Download the 'punkt' sentence tokenizer used by sent_tokenize below.
nltk.download('punkt')

# Read the OpenAI API key from the environment rather than hard-coding it in source.
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Define the sentence-relevance classifier: a RoBERTa encoder with a small
# classification head over the representation at position 0.
class Bert_model(nn.Module):
    def __init__(self, hidden_size, dropout_rate):
        super(Bert_model, self).__init__()
        self.hidden_size = hidden_size
        self.bert = RobertaModel.from_pretrained('deepset/roberta-base-squad2')
        self.cls_prj = nn.Linear(hidden_size, hidden_size, bias=True)
        self.cls_dropout = nn.Dropout(dropout_rate)
        self.cls_final = nn.Linear(hidden_size, 2, bias=True)

    def forward(self, input_ids, attention_mask):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_sequence_output = bert_outputs.last_hidden_state
        # Use the hidden state of the first token (<s>, RoBERTa's CLS equivalent)
        # as a pooled summary of the whole input.
        bert_pooled_output = bert_sequence_output[:, 0, :]
        pooled_output = self.cls_prj(bert_pooled_output)
        pooled_output = self.cls_dropout(pooled_output)
        logits = self.cls_final(pooled_output)
        return logits
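
# Note on the head: the forward pass returns two logits per input pair; below
# they are softmaxed and read as [P(not relevant), P(relevant)] for a
# (question, sentence) pair, so a single pair yields a tensor of shape (1, 2).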
# Load the fine-tuned classifier weights.
model_path = "/content/model.pt"  # Replace with your actual model path
device = "cuda" if torch.cuda.is_available() else "cpu"
state_dict = torch.load(model_path, map_location=device)

# Instantiate the model architecture; hidden_size=768 matches roberta-base and
# must agree with the saved checkpoint.
model = Bert_model(hidden_size=768, dropout_rate=0.1)
# The checkpoint was presumably saved from an nn.DataParallel wrapper (its keys
# carry a "module." prefix), so wrap the model the same way before loading.
model = nn.DataParallel(model)
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()
# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('deepset/roberta-base-squad2')
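
# Note: Streamlit reruns this whole script on every widget interaction, so in a
# deployed app the model and tokenizer loads above would typically be wrapped in
# @st.cache_resource-decorated loader functions so they run only once.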
# Extract text from the PDF (skipping the first two pages, as in the original
# script) and tokenize it into a single fixed-length input.
def preprocess_pdf(pdf_path, tokenizer):
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can return None for image-only pages, so substitute "".
        text = " ".join([page.extract_text() or "" for page in pdf.pages[2:]])
    tokenized_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_attention_mask=True
    )
    input_ids = torch.tensor([tokenized_text['input_ids']])
    attention_mask = torch.tensor([tokenized_text['attention_mask']])
    return input_ids, attention_mask, text
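
# preprocess_pdf returns (1, 512)-shaped tensors plus the raw extracted text;
# only the text is actually consumed by the question-answering flow below,
# which re-tokenizes it sentence by sentence.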
# Translate text into the selected UI language via the OpenAI chat API.
def translate_text(text, target_language):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that translates English text to other languages."},
            {"role": "user", "content": f'Translate the following English text to {target_language}: "{text}"'},
        ],
    )
    return response.choices[0].message['content']
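
# Illustrative call (the exact output is model-generated):
#   translate_text("Net revenue increased by 12%.", "French")
#   -> e.g. "Le chiffre d'affaires net a augmenté de 12 %."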
# Ask the OpenAI chat API for a plain-language definition of a term.
def explain_term(term):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that provides definitions."},
            {"role": "user", "content": f"Explain the term: {term}"},
        ],
    )
    return response.choices[0].message['content']
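
# Illustrative call: explain_term("EBITDA") returns a short model-generated
# definition of the term.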
# Streamlit code to upload file
st.title('FinQA (Financial Question-Answering)')
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
# Select the output language for answers and definitions.
language = st.selectbox('Select your language', ['English', 'French', 'Chinese', 'Korean', 'Spanish', 'German', 'Japanese'])
if uploaded_file is not None:
    # Persist the upload to disk so pdfplumber can open it by path.
    with open("temp.pdf", "wb") as f:
        f.write(uploaded_file.getbuffer())
    input_ids, attention_mask, text = preprocess_pdf("temp.pdf", tokenizer)
    st.write('File successfully uploaded and processed')

    # Ask a question about the uploaded document.
    question = st.text_input("Enter your question:")
    if question:
        # Score every sentence in the document for relevance to the question.
        sentences = sent_tokenize(text)
        predictions = []
        for sentence in sentences:
            inputs = tokenizer.encode_plus(question, sentence, return_tensors='pt',
                                           padding='max_length', truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            with torch.no_grad():
                outputs = model(input_ids, attention_mask)
            probabilities = F.softmax(outputs, dim=1)
            _, max_index = torch.max(probabilities, dim=1)
            prediction = max_index.item()
            predictions.append((sentence, prediction, probabilities[0].tolist()))
        # Rank sentences by the predicted probability of the "relevant" class
        # and keep the top 13 as context for answering.
        predictions.sort(key=lambda pair: pair[2][1], reverse=True)
        top_sentences = predictions[:13]
        # Optional debug output: uncomment to inspect the top-ranked sentences.
        # st.write("Top Relevant Sentences:")
        # for sentence, prediction, probabilities in top_sentences:
        #     st.write(f"Sentence: {sentence}, Prediction: {prediction}, Probability: {probabilities[prediction]}")

        # Concatenate the top-ranked sentences into a single context block.
        chat_history = "\n".join([sentence[0] for sentence in top_sentences])
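
        # This is a simple retrieval-augmented prompt: GPT-4 sees the extracted
        # context first, then the user's question, e.g.
        #   context:  "Revenue was $1.2B in 2020. ..."   (illustrative)
        #   question: "What was revenue in 2020?"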
        # Answer the question with GPT-4, using the retrieved sentences as context.
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that reads short paragraphs and answers questions about them."},
                {"role": "user", "content": chat_history},
                {"role": "user", "content": question},
            ]
        )
        # Translate the answer if a non-English language is selected.
        if language != 'English':
            response_content = translate_text(response.choices[0].message['content'], language)
        else:
            response_content = response.choices[0].message['content']
        st.text("Answer: " + response_content)
    # Separately, let the user ask for a definition of any term.
    term = st.text_input("Enter a term you want to define:")
    if term:
        definition = explain_term(term)
        if language != 'English':
            definition = translate_text(definition, language)
        st.text("Definition: " + definition)
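
# To run locally (assuming a checkpoint exists at model_path and OPENAI_API_KEY
# is set in the environment):
#   streamlit run app.py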