# Spaces status: Runtime error
import os
import tempfile

import nltk
import openai
import pdfplumber
import streamlit as st
import torch
import torch.nn.functional as F
from nltk.tokenize import sent_tokenize
from torch import nn
from transformers import RobertaTokenizer, RobertaModel
# Debug aid: print the working-directory listing to stdout at startup.
print(os.listdir('.'))

# ``sent_tokenize`` needs the Punkt sentence-tokenizer data.  Recent NLTK
# releases load it from the pickle-free ``punkt_tab`` package, older ones
# from ``punkt``; fetch both so tokenization works with either.
# ``nltk.download`` reports a failed download by returning False rather than
# raising, so an unavailable package does not abort startup.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)
class Bert_model(nn.Module):
    """RoBERTa encoder topped with a small two-way classification head.

    The head reads the encoder's hidden state at sequence position 0 and
    passes it through a linear projection, dropout, and a final linear layer
    that emits two logits.
    """

    def __init__(self, hidden_size, dropout_rate):
        """Build the encoder and the classification head.

        Args:
            hidden_size: Input and output width of the projection layer and
                input width of the final layer (must match the encoder's
                hidden-state width).
            dropout_rate: Dropout probability applied after the projection.
        """
        super().__init__()
        self.hidden_size = hidden_size
        # The attribute names below double as state-dict keys, so they have
        # to stay exactly as they are for saved checkpoints to load.
        self.bert = RobertaModel.from_pretrained('deepset/roberta-base-squad2')
        self.cls_prj = nn.Linear(hidden_size, hidden_size, bias=True)
        self.cls_dropout = nn.Dropout(dropout_rate)
        self.cls_final = nn.Linear(hidden_size, 2, bias=True)

    def forward(self, input_ids, attention_mask):
        """Return two classification logits per input sequence.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Attention mask, forwarded to the encoder unchanged.

        Returns:
            The output of the final linear layer: one row of two logits for
            each sequence in the batch.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token of every sequence.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        projected = self.cls_dropout(self.cls_prj(first_token_state))
        return self.cls_final(projected)
model_path = "model.pt"

# Use the first GPU when one is present and fall back to the CPU otherwise;
# a hard-coded "cuda:0" makes the app crash on CPU-only hosts.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# ``map_location`` remaps tensors that were saved from a CUDA device, so the
# checkpoint can also be deserialized on a machine without a GPU.
state_dict = torch.load(model_path, map_location=device)

model = Bert_model(hidden_size=768, dropout_rate=0.1)
# The model is wrapped in DataParallel *before* the weights are loaded, as in
# the original code -- presumably the checkpoint was saved from a wrapped
# model and its keys carry the "module." prefix (TODO confirm).
model = nn.DataParallel(model)
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()  # inference only: disables dropout

tokenizer = RobertaTokenizer.from_pretrained('deepset/roberta-base-squad2')
def preprocess_pdf(pdf_path, tokenizer, skip_pages=2):
    """Extract a PDF's text and tokenize it into fixed-length model inputs.

    Args:
        pdf_path: Location of the PDF; passed straight to ``pdfplumber.open``.
        tokenizer: Tokenizer providing ``encode_plus``.
        skip_pages: Number of leading pages to leave out of the extracted
            text. Defaults to 2, the value that used to be hard-coded.

    Returns:
        A tuple ``(input_ids, attention_mask, text)``: two tensors, each
        holding a single row of 512 entries, and the extracted text (page
        texts joined with single spaces).
    """
    with pdfplumber.open(pdf_path) as pdf:
        # ``extract_text`` can return None for a page without extractable
        # text; substitute an empty string so the join below cannot fail.
        page_texts = [
            page.extract_text() or "" for page in pdf.pages[skip_pages:]
        ]
    text = " ".join(page_texts)

    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        # Without explicit truncation, ``max_length`` is not enforced and a
        # long document yields sequences far beyond 512 tokens.
        truncation=True,
        return_attention_mask=True,
    )
    input_ids = torch.tensor([encoded['input_ids']])
    attention_mask = torch.tensor([encoded['attention_mask']])
    return input_ids, attention_mask, text
def translate_text(text, target_language):
    """Translate English ``text`` into ``target_language`` via the OpenAI chat API.

    Args:
        text: English text to translate; it is embedded in the prompt between
            double quotes.
        target_language: Name of the language to translate into, inserted
            into the prompt as given.

    Returns:
        The message content of the first choice in the chat completion.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that translates English text to other languages.",
    }
    user_message = {
        "role": "user",
        "content": f'Translate the following English text to {target_language}: "{text}"',
    }
    completion = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message['content']
def explain_term(term):
    """Ask the OpenAI chat API to explain ``term``.

    Args:
        term: The term to explain, inserted into the prompt as given.

    Returns:
        The message content of the first choice in the chat completion.
    """
    prompt_messages = [
        {"role": "system", "content": "You are a helpful assistant that provides definitions."},
        {"role": "user", "content": f"Explain the term: {term}"},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=prompt_messages,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']
st.title('FinChat')

# The OpenAI key is read from the app's Streamlit secrets.  (An earlier draft
# asked the user to type a key in and validated it with a test chat request;
# that path has been dropped.)
try:
    api_key = st.secrets["api_key"]
except (KeyError, FileNotFoundError):
    # Missing secret or missing secrets file: show an actionable message and
    # end this script run instead of surfacing a raw traceback.
    st.error('OpenAI API key is not configured. Add "api_key" to the app secrets.')
    st.stop()
openai.api_key = api_key

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
# Language the answers and definitions are shown in; anything other than
# 'English' is routed through a translation request further down.
language = st.selectbox(
    'Select your language',
    ['English', 'French', 'Chinese', 'Korean', 'Spanish', 'German', 'Japanese'],
)
# How many of the highest-scoring sentences are handed to the chat model.
TOP_K_SENTENCES = 13
# Sentences scored per forward pass of the classifier.
SCORING_BATCH_SIZE = 16


def _score_sentences(question, sentences):
    """Classify every (question, sentence) pair with the fine-tuned model.

    Args:
        question: The user's question; paired with each sentence.
        sentences: Candidate sentences from the document.

    Returns:
        A list with one ``(sentence, predicted_label, [p0, p1])`` tuple per
        input sentence, in input order. ``predicted_label`` is the index of
        the larger softmax probability.
    """
    scored = []
    for start in range(0, len(sentences), SCORING_BATCH_SIZE):
        batch = sentences[start:start + SCORING_BATCH_SIZE]
        # Pad only to the longest pair in the batch: padding every sentence
        # to 512 tokens and running them one at a time is needlessly slow.
        inputs = tokenizer(
            [question] * len(batch),
            batch,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            logits = model(
                inputs['input_ids'].to(device),
                inputs['attention_mask'].to(device),
            )
        probabilities = F.softmax(logits, dim=1)
        labels = probabilities.argmax(dim=1)
        scored.extend(zip(batch, labels.tolist(), probabilities.tolist()))
    return scored


if uploaded_file is not None:
    # Write the upload to a private temporary file: a fixed shared name lets
    # concurrent sessions overwrite each other's document and is never
    # cleaned up.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
        tmp_pdf.write(uploaded_file.getbuffer())
        pdf_path = tmp_pdf.name
    try:
        # Only the extracted text is used below; the tensors are discarded.
        _, _, text = preprocess_pdf(pdf_path, tokenizer)
    finally:
        os.remove(pdf_path)
    st.write('File successfully uploaded and processed')

    question = st.text_input("Enter your question:")
    if question:
        sentences = sent_tokenize(text)
        if not sentences:
            # Nothing to rank; avoid sending an empty context to the model.
            st.warning("No readable text was found in the uploaded PDF.")
        else:
            predictions = _score_sentences(question, sentences)
            # Rank by the probability of class 1 -- the class the original
            # descending sort on the predicted label already favoured --
            # so that the best candidates come first instead of merely the
            # earliest ones in the document.
            predictions.sort(key=lambda item: item[2][1], reverse=True)
            top_sentences = predictions[:TOP_K_SENTENCES]
            chat_history = "\n".join(sentence for sentence, _, _ in top_sentences)
            response = openai.ChatCompletion.create(
                model="gpt-4-1106-preview",
                messages=[
                    {"role": "system", "content": "You are a helpful generator which read the short paragraphs and answer the question."},
                    {"role": "user", "content": chat_history},
                    {"role": "user", "content": question},
                ],
            )
            response_content = response.choices[0].message['content']
            if language != 'English':
                response_content = translate_text(response_content, language)
            st.text("Answer: " + response_content)
# Glossary lookup: explain a user-supplied term and, unless English is
# selected, translate the explanation before displaying it.
term = st.text_input("Enter a term you want to define:")
if term:
    definition = explain_term(term)
    definition = (
        definition
        if language == 'English'
        else translate_text(definition, language)
    )
    st.text("Definition: " + definition)