# alidemo / streamlit.py
# alifatmi's picture
# Add application file
# 858bb9d
# raw history blame
# No virus
# 3.96 kB
# %%writefile app.py
import streamlit as st
# Page heading for the app; issued before the remaining imports run.
st.title("HEALTHQUERY")
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
AdamW, get_linear_schedule_with_warmup, \
TrainingArguments, BeamScorer, Trainer
import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
RandomSampler, SequentialSampler
from IPython.display import clear_output
from transformers import BioGptTokenizer, BioGptForCausalLM, TrainerCallback
from transformers import pipeline
# Alternative summarization checkpoints, currently disabled:
#summarizer_bart = pipeline("summarization", model="facebook/bart-large-cnn")
#summarizer_knnkar = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
# Summarization pipeline applied further down to the generated text.
summarizer_sshle = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
import os
# ---------------------------------------------------------------------------
# Script-wide configuration.
# Only DEBUG-independent values MAXLEN and SPECIAL_TOKENS are read by the
# visible inference code; the remaining names look like training settings and
# are kept unchanged in case other code refers to them.
# ---------------------------------------------------------------------------
DEBUG = False

INPUT_DIR = 'articles'

USE_APEX = True
APEX_OPT_LEVEL = 'O1'

MODEL = 'gpt2'  # {gpt2, gpt2-medium, gpt2-large, gpt2-xl}

UNFREEZE_LAST_N = 6  # The last N layers to unfreeze for training

# Marker strings used to build the generation prompt (BOS + text + SEP).
SPECIAL_TOKENS = {
    "bos_token": "<|BOS|>",
    "eos_token": "<|EOS|>",
    "unk_token": "<|UNK|>",
    "pad_token": "<|PAD|>",
    "sep_token": "<|SEP|>",
}

# Passed as max_length to model.generate().
MAXLEN = 256  # {768, 1024, 1280, 1600}

TRAIN_SIZE = 0.8

# Batch size / accumulation pair depends on whether APEX is enabled.
if USE_APEX:
    TRAIN_BATCHSIZE = 16
    BATCH_UPDATE = 128
else:
    TRAIN_BATCHSIZE = 8
    BATCH_UPDATE = 256

EPOCHS = 3
LR = 5e-4
EPS = 1e-8
# A step count is an integer; the previous float literal 1e2 compares equal
# to this value, so existing comparisons are unaffected.
WARMUP_STEPS = 100
SEED = 2020

# NOTE(review): spelling kept as-is because the name may be referenced elsewhere.
DEVIDE_BY = 20

# Environment flag set for the Weights & Biases integration.
os.environ['WANDB_DISABLED'] = 'true'
# Load the "microsoft/biogpt" tokenizer and the causal LM weights stored in a
# local checkpoint directory.
# NOTE(review): Streamlit is documented to re-run the script on every
# interaction, so these loads presumably repeat each time; consider caching
# them (e.g. st.cache_resource) -- confirm against the deployed version.
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained('/content/drive/MyDrive/All models/biogpt')

# Free-text input from the user; an empty string until something is entered.
input_text = st.text_input("Please Provide your text:")
title = input_text

# Prompt layout: BOS marker + user text + SEP marker.
prompt = SPECIAL_TOKENS['bos_token'] + title + SPECIAL_TOKENS['sep_token']

# Use the GPU when one is available, otherwise fall back to CPU instead of
# failing on an unconditional .cuda() call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Encode the prompt and add a leading batch dimension of size 1.
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

model.to(device)
model.eval()  # inference mode for layers such as dropout
from heapq import nsmallest
# Generate text
if input_text:
    # Inference only: no gradients are needed while sampling.
    with torch.no_grad():
        sample_outputs = model.generate(
            generated,
            do_sample=True,
            max_length=MAXLEN,
            top_k=10,
            top_p=0.7,
            temperature=0.5,
            repetition_penalty=2.0,
            num_return_sequences=1,
        )

    # Initialize an empty list to store the perplexity and text pairs
    # (never filled in the visible code; kept in case later code uses it).
    perplexity_text_pairs = []

    # Number of leading characters cut from each decoded sequence before it is
    # shown. NOTE(review): the +25 presumably skips the echoed prompt and its
    # markers -- the constant is carried over unchanged; confirm its origin.
    a = len(title) + 25
    if DEBUG:
        # Diagnostic only: previously this offset was always written to the page.
        st.write(a)

    bart_Val = ""
    for i, sample_output in enumerate(sample_outputs):
        text = tokenizer.decode(sample_output, skip_special_tokens=True)
        bart_Val = text[a:]
        st.write(("{}: {}\n\n".format(i + 1, bart_Val)))
    # All generated answers are printed by the loop above; bart_Val now holds
    # the last one, which is the text that gets summarized.

    #x=summarizer(bart_Val, max_length=200, min_length=30, do_sample=False)
    #st.write('-------Bart summarization-----')
    #st.write(x[0]['summary_text'])
    #summary=summarizer_knnkar(bart_Val, max_length=200, min_length=30, do_sample=False)
    #st.write('-------MEETING_SUMMARY-----')
    #st.write(summary[0]['summary_text'])
    if bart_Val.strip():
        distl = summarizer_sshle(bart_Val, max_length=200, min_length=30, do_sample=False)
        st.write('-------distilbart_cnn_12-6 model -----')
        st.write(distl[0]['summary_text'])
    else:
        # Nothing is left after trimming, so there is nothing to summarize.
        st.write('No generated text to summarize.')
else:
    st.write('Welcome to GPT2')
# Create a "Regenerate" button
# Display output