|
import streamlit as st |
|
# Page heading rendered at the top of the Streamlit app.
st.title(body="HEALTHQUERY")
|
import os |
|
import io |
|
import requests |
|
import numpy as np |
|
import pandas as pd |
|
import re |
|
import zipfile |
|
import random |
|
import time |
|
import csv |
|
import datetime |
|
from itertools import compress |
|
from collections import Counter, defaultdict |
|
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \ |
|
AdamW, get_linear_schedule_with_warmup, \ |
|
TrainingArguments, BeamScorer, Trainer |
|
|
|
import torch |
|
from torch.utils.data import Dataset, random_split, DataLoader, \ |
|
RandomSampler, SequentialSampler |
|
|
|
|
|
from transformers import BioGptTokenizer, BioGptForCausalLM, TrainerCallback |
|
from transformers import pipeline |
|
|
|
|
|
# Hugging Face summarization pipeline backed by the
# "sshleifer/distilbart-cnn-12-6" checkpoint. It is built once at module
# level and called later in this script to condense the generated text.
summarizer_sshle = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
|
|
|
import os |
|
# ---------------------------------------------------------------------------
# Script-wide configuration.
#
# Only DEBUG, SPECIAL_TOKENS and MAXLEN are read by the inference code in this
# script. The remaining values look like training hyper-parameters carried
# over from a fine-tuning script -- presumably unused here; TODO confirm
# before removing any of them.
# ---------------------------------------------------------------------------

# Debug switch; off by default.
DEBUG = False

# Directory name for input articles (not referenced by the inference code).
INPUT_DIR = 'articles'

# Mixed-precision settings (presumably NVIDIA Apex -- verify against the
# training code that consumes them).
USE_APEX = True
APEX_OPT_LEVEL = 'O1'

# Base model identifier (the inference code loads BioGPT explicitly instead).
MODEL = 'gpt2'

# Number of trailing layers to unfreeze (training-time setting, presumably).
UNFREEZE_LAST_N = 6

# Marker strings wrapped around the user's text when the prompt is built.
SPECIAL_TOKENS = {
    "bos_token": "<|BOS|>",
    "eos_token": "<|EOS|>",
    "unk_token": "<|UNK|>",
    "pad_token": "<|PAD|>",
    "sep_token": "<|SEP|>",
}

# Upper bound passed as `max_length` to text generation.
MAXLEN = 256

# Fraction of the data used for training (training-time setting, presumably).
TRAIN_SIZE = 0.8

# Batch size and update interval depend on whether Apex is enabled.
if USE_APEX:
    TRAIN_BATCHSIZE = 16
    BATCH_UPDATE = 128
else:
    TRAIN_BATCHSIZE = 8
    BATCH_UPDATE = 256

# Optimizer / schedule settings (training-time, presumably).
EPOCHS = 3
LR = 5e-4
EPS = 1e-8
WARMUP_STEPS = 1e2

# Random seed (training-time, presumably).
SEED = 2020

# NOTE(review): the name is spelled "DEVIDE"; it is kept as-is because other
# code may reference it by this exact name.
DEVIDE_BY = 20

# Turn off Weights & Biases logging for anything that honours this variable.
os.environ['WANDB_DISABLED'] = 'true'
|
|
|
|
|
|
|
# Tokenizer is loaded from the "microsoft/biogpt" checkpoint.
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")

# Local checkpoint for the generation model. The string is handed to
# from_pretrained verbatim, so it must not carry leading/trailing whitespace.
# NOTE(review): from_pretrained is normally given a model directory or a hub
# id; if loading from the .bin file itself fails, point this at the directory
# that holds config.json next to the weights -- confirm against the deployment.
MODEL_WEIGHTS_PATH = 'alidemo/pytorch_model.bin'
model = BioGptForCausalLM.from_pretrained(MODEL_WEIGHTS_PATH)
# NOTE(review): Streamlit re-executes the script on every interaction, so both
# loads above repeat on each rerun unless wrapped in a cached loader (e.g.
# st.cache_resource on recent Streamlit versions) -- consider adding one.
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Interactive flow: read the user's text, generate a continuation with the
# BioGPT model, show it, then show a DistilBART summary of it.
# ---------------------------------------------------------------------------

# Text typed by the user; the generation branch below runs only when it is
# non-empty.
input_text = st.text_input("Please Provide your text:")
title = input_text

# The prompt is the user's text wrapped in the BOS and SEP marker strings.
prompt = SPECIAL_TOKENS['bos_token'] + title + SPECIAL_TOKENS['sep_token']

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# app still starts on hosts without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Encoded prompt as a (1, sequence_length) tensor on the model's device.
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

# Not used by the code below; kept because other code may rely on the name.
from heapq import nsmallest

if input_text:
    sample_outputs = model.generate(
        generated,
        do_sample=True,
        max_length=MAXLEN,
        top_k=10,
        top_p=0.7,
        temperature=0.5,
        repetition_penalty=2.0,
        num_return_sequences=1,
    )

    # Not filled in by the code below; kept so the name stays defined.
    perplexity_text_pairs = []

    # Number of leading characters dropped from the decoded text so that only
    # the continuation is shown. The extra 25 presumably covers the decoded
    # BOS/SEP marker text around the user's input -- TODO confirm.
    a = len(title) + 25

    for i, sample_output in enumerate(sample_outputs):
        text = tokenizer.decode(sample_output, skip_special_tokens=True)

        # The raw offset is a debugging aid, so show it only in debug mode.
        if DEBUG:
            st.write(a)
        st.write("{}: {}\n\n".format(i + 1, text[a:]))

        # Continuation text handed to the summarizer after the loop.
        bart_Val = text[a:]

    # With num_return_sequences=1 there is exactly one continuation to
    # summarize.
    distl = summarizer_sshle(
        bart_Val,
        max_length=200,
        min_length=30,
        do_sample=False,
    )
    st.write('-------distilbart_cnn_12-6 model -----')
    st.write(distl[0]['summary_text'])
else:
    # Nothing entered yet: show the greeting instead of running generation.
    st.write('Welcome to GPT2')
|
|
|
|
|
|
|
|
|
|
|
|