File size: 3,960 Bytes
858bb9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143


%%writefile app.py
import streamlit as st

st.title("HEALTHQUERY")
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output
from transformers import BioGptTokenizer, BioGptForCausalLM, TrainerCallback
from transformers import pipeline
# Alternative summarization checkpoints (currently disabled):
#   "facebook/bart-large-cnn", "knkarthick/MEETING_SUMMARY"

# Streamlit re-executes this script on every widget interaction. Building the
# pipeline through a cached helper keeps one instance alive per server process
# instead of re-creating it on each rerun. Streamlit releases that lack
# ``st.cache_resource`` fall back to the previous (uncached) behaviour.
_cache_summarizer = getattr(st, "cache_resource", lambda func: func)


@_cache_summarizer
def _load_summarizer():
    """Return the DistilBART summarization pipeline used by the app."""
    return pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


summarizer_sshle = _load_summarizer()

import os
# ---------------------------------------------------------------------------
# Script-wide settings.
#
# In the part of the script visible here only SPECIAL_TOKENS (prompt
# construction) and MAXLEN (generation length) are read further down; the
# remaining values are plain module-level constants.
# ---------------------------------------------------------------------------
DEBUG = False

INPUT_DIR = 'articles'

# Mixed-precision switches.
USE_APEX = True
APEX_OPT_LEVEL = 'O1'

# One of: gpt2, gpt2-medium, gpt2-large, gpt2-xl
MODEL = 'gpt2'

# The last N layers to unfreeze for training.
UNFREEZE_LAST_N = 6

# Marker strings; the prompt is assembled as bos_token + text + sep_token.
SPECIAL_TOKENS = dict(
    bos_token="<|BOS|>",
    eos_token="<|EOS|>",
    unk_token="<|UNK|>",
    pad_token="<|PAD|>",
    sep_token="<|SEP|>",
)

# Passed as max_length to generation. Other values noted by the original
# author: 768, 1024, 1280, 1600.
MAXLEN = 256

TRAIN_SIZE = 0.8

# Batch size and update interval are chosen together from the APEX switch.
TRAIN_BATCHSIZE, BATCH_UPDATE = (16, 128) if USE_APEX else (8, 256)

EPOCHS = 3
LR = 5e-4
EPS = 1e-8
WARMUP_STEPS = 1e2

SEED = 2020

DEVIDE_BY = 20

# Set at import time so the variable is already present in the environment
# for anything that runs afterwards.
os.environ['WANDB_DISABLED'] = 'true'



# Streamlit re-executes this script on every widget interaction. Loading the
# tokenizer and weights through a cached helper keeps a single copy alive per
# server process instead of reading the checkpoint again on each rerun.
# Streamlit releases that lack ``st.cache_resource`` fall back to the previous
# (uncached) behaviour.
_cache_biogpt = getattr(st, "cache_resource", lambda func: func)


@_cache_biogpt
def _load_biogpt():
    """Return ``(tokenizer, model)`` for BioGPT.

    The tokenizer comes from the public ``microsoft/biogpt`` checkpoint; the
    causal-LM weights are read from a local directory.
    """
    # NOTE(review): the tokenizer and the weights come from different
    # locations - confirm the local checkpoint uses the stock vocabulary.
    biogpt_tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
    biogpt_model = BioGptForCausalLM.from_pretrained('/content/drive/MyDrive/All models/biogpt')
    return biogpt_tokenizer, biogpt_model


tokenizer, model = _load_biogpt()



# Read the user's text and wrap it in the BOS/SEP markers to form the prompt.
input_text = st.text_input("Please Provide your text:")
title = input_text
prompt = SPECIAL_TOKENS['bos_token'] + title + SPECIAL_TOKENS['sep_token']

# Encode the prompt and add a leading batch dimension of size 1.
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

# Prefer the GPU, but fall back to the CPU so the app still starts on hosts
# without CUDA instead of raising while moving the tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generated = generated.to(device)
model.to(device)
model.eval()  # inference only
from heapq import nsmallest

# Generate a continuation for the prompt, display it, then display a summary
# of it. With no input yet, only a greeting is shown.
if input_text:
    sample_outputs = model.generate(
        generated,
        do_sample=True,
        max_length=MAXLEN,
        top_k=10,
        top_p=0.7,
        temperature=0.5,
        repetition_penalty=2.0,
        num_return_sequences=1,
    )

    # The decoded text is sliced so that only the part after the prompt is
    # shown.
    # NOTE(review): the extra 25 characters presumably account for the decoded
    # marker strings around the prompt - confirm against real model output.
    answer_start = len(title) + 25

    answers = []
    for number, sample_output in enumerate(sample_outputs, start=1):
        text = tokenizer.decode(sample_output, skip_special_tokens=True)
        answer = text[answer_start:]
        answers.append(answer)
        st.write("{}: {}\n\n".format(number, answer))

    # Summarize the last generated sequence (only one sequence is requested).
    text_to_summarize = answers[-1] if answers else ""

    if text_to_summarize.strip():
        distl = summarizer_sshle(
            text_to_summarize, max_length=200, min_length=30, do_sample=False
        )
        st.write('-------distilbart_cnn_12-6 model -----')
        st.write(distl[0]['summary_text'])
    else:
        # Nothing is left after slicing off the prompt, so there is no text
        # to hand to the summarizer.
        st.warning('The model did not return any text to summarize.')
else:
    st.write('Welcome to GPT2')
# Create a "Regenerate" button


# Display output