# Install Streamlit, pyngrok, and the other app dependencies
!pip install -q streamlit
!pip install -q pyngrok
!pip install -q pdfplumber
!pip install -q transformers
!pip install -q tabula-py
!pip install -q openai
# Write the Streamlit app script
%%writefile app.py
import streamlit as st
import pdfplumber
import torch
from transformers import RobertaTokenizer, RobertaModel
import nltk
import openai
from torch import nn
import torch.nn.functional as F
from nltk.tokenize import sent_tokenize

# Download the 'punkt' sentence tokenizer
nltk.download('punkt')
openai.api_key = 'YOUR_OPENAI_API_KEY'  # Replace with your actual OpenAI API key; never commit a real key
# Define the model architecture: a RoBERTa encoder with a binary
# classification head over the pooled first-token representation
class Bert_model(nn.Module):
    def __init__(self, hidden_size, dropout_rate):
        super(Bert_model, self).__init__()
        self.hidden_size = hidden_size
        self.bert = RobertaModel.from_pretrained('deepset/roberta-base-squad2')
        self.cls_prj = nn.Linear(hidden_size, hidden_size, bias=True)
        self.cls_dropout = nn.Dropout(dropout_rate)
        self.cls_final = nn.Linear(hidden_size, 2, bias=True)

    def forward(self, input_ids, attention_mask):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_sequence_output = bert_outputs.last_hidden_state
        # Use the first token's hidden state as the pooled representation
        bert_pooled_output = bert_sequence_output[:, 0, :]
        pooled_output = self.cls_prj(bert_pooled_output)
        pooled_output = self.cls_dropout(pooled_output)
        logits = self.cls_final(pooled_output)
        return logits
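# Shape note (inferred from the usage further below): input_ids and
# attention_mask are (batch, seq_len) tensors; forward() returns (batch, 2)
# logits, and class 1 is treated as the "relevant sentence" class.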
# Load the trained model weights
model_path = "/content/model.pt"  # Replace with your actual model path
device = "cuda" if torch.cuda.is_available() else "cpu"
state_dict = torch.load(model_path, map_location=device)

# Instantiate the model architecture
model = Bert_model(hidden_size=768, dropout_rate=0.1)  # hidden_size must match the saved model
# Wrapping in DataParallel before loading matches a checkpoint saved from a DataParallel model
model = nn.DataParallel(model)
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('deepset/roberta-base-squad2')
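# Note: Streamlit reruns this whole script on every user interaction, so the
# model and tokenizer loads above repeat each time; wrapping them in one of
# Streamlit's caching decorators would avoid the reloads (a suggested tweak,
# not part of the original app).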
# Function to extract and tokenize PDF text
def preprocess_pdf(pdf_path, tokenizer):
    with pdfplumber.open(pdf_path) as pdf:
        # Skip the first two pages; guard against pages with no extractable text
        text = " ".join([page.extract_text() or "" for page in pdf.pages[2:]])
    tokenized_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_attention_mask=True
    )
    input_ids = torch.tensor([tokenized_text['input_ids']])
    attention_mask = torch.tensor([tokenized_text['attention_mask']])
    return input_ids, attention_mask, text
# Translate text via the OpenAI chat API (pre-1.0 openai client interface)
def translate_text(text, target_language):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that translates English text to other languages."},
            {"role": "user", "content": f'Translate the following English text to {target_language}: "{text}"'},
        ],
    )
    return response.choices[0].message['content']
# Ask the model for a definition of a term
def explain_term(term):
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that provides definitions."},
            {"role": "user", "content": f"Explain the term: {term}"},
        ],
    )
    return response.choices[0].message['content']
# Streamlit UI: upload a PDF file
st.title('FinQA (Financial Question-Answering)')
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

# Select the answer language
language = st.selectbox('Select your language', ['English', 'French', 'Chinese', 'Korean', 'Spanish', 'German', 'Japanese'])
if uploaded_file is not None:
    with open("temp.pdf", "wb") as f:
        f.write(uploaded_file.getbuffer())
    input_ids, attention_mask, text = preprocess_pdf("temp.pdf", tokenizer)
    st.write('File successfully uploaded and processed')

    # Ask a question about the uploaded document
    question = st.text_input("Enter your question:")
    if question:
        # Score every sentence in the document against the question
        sentences = sent_tokenize(text)
        predictions = []
        for sentence in sentences:
            inputs = tokenizer.encode_plus(question, sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            with torch.no_grad():
                outputs = model(input_ids, attention_mask)
            probabilities = F.softmax(outputs, dim=1)
            max_value, max_index = torch.max(probabilities, dim=1)
            prediction = max_index.item()
            predictions.append((sentence, prediction, probabilities[0].tolist()))
        # Rank sentences by the probability of the "relevant" class and keep the top 13
        predictions.sort(key=lambda pair: pair[2][1], reverse=True)
        top_sentences = predictions[:13]
        # Uncomment to inspect the retrieved sentences:
        # st.write("Top Relevant Sentences:")
        # for sentence, prediction, probabilities in top_sentences:
        #     st.write(f"Sentence: {sentence}, Prediction: {prediction}, Probability: {probabilities[prediction]}")
        # Build the chat context from the top-ranked sentences
        chat_history = "\n".join([sentence[0] for sentence in top_sentences])

        # Answer the question with the OpenAI API, grounding it in the retrieved sentences
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that reads short paragraphs and answers the question based on them."},
                {"role": "user", "content": chat_history},
                {"role": "user", "content": question},
            ]
        )
        if language != 'English':
            response_content = translate_text(response.choices[0].message['content'], language)
        else:
            response_content = response.choices[0].message['content']
        st.text("Answer: " + response_content)
# Term definition lookup, independent of the uploaded file
term = st.text_input("Enter a term you want to define:")
if term:
    # Define the term using the OpenAI API
    definition = explain_term(term)
    if language != 'English':
        definition = translate_text(definition, language)
    st.text("Definition: " + definition)