alifatmi committed
Commit
858bb9d
1 Parent(s): a085127

Add application file

Requriements.txt ADDED
@@ -0,0 +1,5 @@
+ streamlit
+ transformers
+ numpy
+ pandas
+ torch
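Note: streamlit.py in this commit also imports requests, scikit-learn and IPython, and BioGptTokenizer typically needs sacremoses as well, so a fuller (still unpinned) dependency list might look like the following sketch. Package names are inferred from the script's imports, not taken from the original file:

    streamlit
    transformers
    numpy
    pandas
    torch
    scikit-learn
    requests
    ipython
    sacremoses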
config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "_name_or_path": "microsoft/biogpt",
+   "activation_dropout": 0.0,
+   "architectures": [
+     "BioGptForCausalLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-12,
+   "layerdrop": 0.0,
+   "max_position_embeddings": 1024,
+   "model_type": "biogpt",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 1,
+   "scale_embedding": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.27.4",
+   "use_cache": true,
+   "vocab_size": 42384
+ }
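For reference, a minimal sketch of how a config like the one above is consumed. It assumes the base "microsoft/biogpt" checkpoint named in "_name_or_path", since the fine-tuned weights themselves are not part of this commit:

    from transformers import BioGptTokenizer, BioGptForCausalLM

    # Loads the architecture described by config.json: 24 layers, 16 heads,
    # hidden size 1024, vocab size 42384.
    tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
    model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
    print(model.config.num_hidden_layers, model.config.hidden_size)  # 24 1024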
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "pad_token_id": 1,
+   "transformers_version": "4.27.4"
+ }
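These defaults (bos/eos/pad ids matching config.json) are what generate() falls back to when no explicit generation config is passed. A small sketch, assuming the files from this commit sit in the current directory:

    from transformers import GenerationConfig

    gen_cfg = GenerationConfig.from_pretrained(".")  # reads generation_config.json
    print(gen_cfg.bos_token_id, gen_cfg.eos_token_id, gen_cfg.pad_token_id)  # 0 2 1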
merges (1).txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
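These are the stock BioGPT tokenizer specials (<s>, </s>, <pad>, <unk>). Worth noting: streamlit.py below builds its prompt from its own literal markers (<|BOS|>, <|SEP|>), which this tokenizer treats as ordinary text rather than special tokens. A quick check, as a sketch:

    from transformers import BioGptTokenizer

    tok = BioGptTokenizer.from_pretrained("microsoft/biogpt")
    print(tok.bos_token, tok.eos_token, tok.pad_token)  # <s> </s> <pad>
    print(tok.tokenize("<|BOS|>"))  # split into ordinary subword pieces, not one special token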
streamlit.py ADDED
@@ -0,0 +1,142 @@
+
+
+ # %%writefile app.py  -- Colab cell magic commented out; it is not valid Python in a standalone script
+ import streamlit as st
+
+ st.title("HEALTHQUERY")
+ import os
+ import io
+ import requests
+ import numpy as np
+ import pandas as pd
+ import re
+ import zipfile
+ import random
+ import time
+ import csv
+ import datetime
+ from itertools import compress
+ from collections import Counter, defaultdict
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
+     AdamW, get_linear_schedule_with_warmup, \
+     TrainingArguments, BeamScorer, Trainer
+
+ import torch
+ from torch.utils.data import Dataset, random_split, DataLoader, \
+     RandomSampler, SequentialSampler
+
+ from IPython.display import clear_output
+ from transformers import BioGptTokenizer, BioGptForCausalLM, TrainerCallback
+ from transformers import pipeline
+ #summarizer_bart = pipeline("summarization", model="facebook/bart-large-cnn")
+ #summarizer_knnkar = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
+ summarizer_sshle = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+
+ import os
+ DEBUG = False
+
+ INPUT_DIR = 'articles'
+
+ USE_APEX = True
+ APEX_OPT_LEVEL = 'O1'
+
+ MODEL = 'gpt2'  # {gpt2, gpt2-medium, gpt2-large, gpt2-xl}
+
+ UNFREEZE_LAST_N = 6  # the last N layers to unfreeze for training
+
+ SPECIAL_TOKENS = {"bos_token": "<|BOS|>",
+                   "eos_token": "<|EOS|>",
+                   "unk_token": "<|UNK|>",
+                   "pad_token": "<|PAD|>",
+                   "sep_token": "<|SEP|>"}
+
+ MAXLEN = 256  # {768, 1024, 1280, 1600}
+
+ TRAIN_SIZE = 0.8
+
+ if USE_APEX:
+     TRAIN_BATCHSIZE = 16
+     BATCH_UPDATE = 128
+ else:
+     TRAIN_BATCHSIZE = 8
+     BATCH_UPDATE = 256
+
+ EPOCHS = 3
+ LR = 5e-4
+ EPS = 1e-8
+ WARMUP_STEPS = 1e2
+
+ SEED = 2020
+
+
+ DEVIDE_BY = 20
+
+ os.environ['WANDB_DISABLED'] = 'true'
+
+
+
+ tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
+ model = BioGptForCausalLM.from_pretrained('/content/drive/MyDrive/All models/biogpt')
+
+
+
+ input_text = st.text_input("Please provide your text:")
+ title = input_text
+ prompt = SPECIAL_TOKENS['bos_token'] + title + SPECIAL_TOKENS['sep_token']
+ generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # fall back to CPU when no GPU is available
+ generated = generated.to(device)
+ model.to(device)  # was model.cuda(), which fails on CPU-only hosts
+ model.eval()
+
+ from heapq import nsmallest
+
+ # Generate text
+
+ if len(input_text) > 0:
+     sample_outputs = model.generate(generated,
+                                     do_sample=True,
+                                     max_length=MAXLEN,
+                                     top_k=10,
+                                     top_p=0.7,
+                                     temperature=0.5,
+                                     repetition_penalty=2.0,
+                                     num_return_sequences=1
+                                     )
+
+
+     # Initialize an empty list to store the perplexity and text pairs
+     perplexity_text_pairs = []
+
+
+     for i, sample_output in enumerate(sample_outputs):
+         text = tokenizer.decode(sample_output, skip_special_tokens=True)
+         a = len(title) + 25
+         st.write(a)
+         st.write("{}: {}\n\n".format(i + 1, text[a:]))
+         # all questions are printed by the code above
+         bart_Val = text[a:]
+         #x = summarizer(bart_Val, max_length=200, min_length=30, do_sample=False)
+         #st.write('-------Bart summarization-----')
+         #st.write(x[0]['summary_text'])
+
+         #summary = summarizer_knnkar(bart_Val, max_length=200, min_length=30, do_sample=False)
+         #st.write('-------MEETING_SUMMARY-----')
+         #st.write(summary[0]['summary_text'])
+
+         distl = summarizer_sshle(bart_Val, max_length=200, min_length=30, do_sample=False)
+         st.write('-------distilbart_cnn_12-6 model -----')
+         st.write(distl[0]['summary_text'])
+
+
+
+ else:
+     st.write('Welcome to GPT2')
+     # Create a "Regenerate" button
+
+
+     # Display output
+
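To try the app locally, install the requirements and launch it with "streamlit run streamlit.py"; the model path '/content/drive/MyDrive/All models/biogpt' is a Colab Drive mount and will need adjusting elsewhere. The script also creates perplexity_text_pairs and imports heapq.nsmallest without using them; a minimal sketch of how they could be wired up, assuming the model, tokenizer, and device variables defined above:

    import torch

    def sequence_perplexity(model, tokenizer, text, device):
        # Re-score a decoded sequence with the same causal LM; exp(mean NLL) = perplexity.
        ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
        with torch.no_grad():
            loss = model(ids, labels=ids).loss
        return torch.exp(loss).item()

    # Inside the generation loop, something like:
    #   ppl = sequence_perplexity(model, tokenizer, text, device)
    #   perplexity_text_pairs.append((ppl, text))
    #   best_ppl, best_text = nsmallest(1, perplexity_text_pairs)[0]  # lowest-perplexity output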
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "model_max_length": 1024,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "special_tokens_map_file": null,
+   "tokenizer_class": "BioGptTokenizer",
+   "unk_token": "<unk>"
+ }
training_args (1).bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b09e76b23eb79b374880ad63eb99198f4a78cbed763a96e46e218d6c593e787
+ size 3579
vocab (1).json ADDED
The diff for this file is too large to render. See raw diff