cypher123gdr committed on
Commit
4fe8579
1 Parent(s): 44c7c5a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -186
app.py CHANGED
@@ -1,196 +1,29 @@
1
- import torch
2
- import torch.nn as nn
3
- import torch.optim as optim
4
- from torchtext.data.utils import get_tokenizer
5
- from torchtext.vocab import build_vocab_from_iterator
6
- from torchtext.datasets import Multi30k
7
- from torch.utils.data import DataLoader, Dataset
8
- from collections import Counter
9
- import spacy
10
  import streamlit as st
 
 
 
11
 
12
- # Load English tokenizer
13
- spacy_en = spacy.load('en_core_web_sm')
14
- tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
15
 
16
- # Define a simple dataset class
17
- class SimpleDataset(Dataset):
18
- def __init__(self, texts, summaries, tokenizer, vocab):
19
- self.texts = texts
20
- self.summaries = summaries
21
- self.tokenizer = tokenizer
22
- self.vocab = vocab
23
 
24
- def __len__(self):
25
- return len(self.texts)
 
26
 
27
- def __getitem__(self, idx):
28
- text = self.texts[idx]
29
- summary = self.summaries[idx]
30
- text_tokens = [self.vocab[token] for token in self.tokenizer(text)]
31
- summary_tokens = [self.vocab[token] for token in self.tokenizer(summary)]
32
- return torch.tensor(text_tokens), torch.tensor(summary_tokens)
33
 
34
- # Example dataset
35
- data = {
36
- "text": ["The cat sat on the mat.", "The dog barked at the mailman.", "She sells seashells by the seashore."],
37
- "summary": ["Cat on mat.", "Dog barked.", "Seashells by seashore."]
38
- }
39
-
40
- texts = data["text"]
41
- summaries = data["summary"]
42
-
43
- # Build the vocabulary
44
- counter = Counter()
45
- for text in texts:
46
- counter.update(tokenizer(text))
47
-
48
- vocab = build_vocab_from_iterator([counter.keys()], specials=["<unk>", "<pad>", "<sos>", "<eos>"])
49
- vocab.set_default_index(vocab["<unk>"])
50
-
51
- # Create dataset and dataloader
52
- dataset = SimpleDataset(texts, summaries, tokenizer, vocab)
53
- dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
54
-
55
- # Define the model
56
- class Encoder(nn.Module):
57
- def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
58
- super().__init__()
59
-
60
- self.embedding = nn.Embedding(input_dim, emb_dim)
61
- self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
62
- self.dropout = nn.Dropout(dropout)
63
-
64
- def forward(self, src):
65
- embedded = self.dropout(self.embedding(src))
66
- outputs, (hidden, cell) = self.rnn(embedded)
67
- return hidden, cell
68
-
69
- class Decoder(nn.Module):
70
- def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
71
- super().__init__()
72
-
73
- self.output_dim = output_dim
74
- self.embedding = nn.Embedding(output_dim, emb_dim)
75
- self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
76
- self.fc_out = nn.Linear(hid_dim, output_dim)
77
- self.dropout = nn.Dropout(dropout)
78
-
79
- def forward(self, input, hidden, cell):
80
- input = input.unsqueeze(0)
81
- embedded = self.dropout(self.embedding(input))
82
- output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
83
- prediction = self.fc_out(output.squeeze(0))
84
- return prediction, hidden, cell
85
-
86
- class Seq2Seq(nn.Module):
87
- def __init__(self, encoder, decoder, device):
88
- super().__init__()
89
-
90
- self.encoder = encoder
91
- self.decoder = decoder
92
- self.device = device
93
-
94
- def forward(self, src, trg, teacher_forcing_ratio=0.5):
95
- trg_len = trg.shape[0]
96
- batch_size = trg.shape[1]
97
- trg_vocab_size = self.decoder.output_dim
98
-
99
- outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
100
- hidden, cell = self.encoder(src)
101
-
102
- input = trg[0,:]
103
-
104
- for t in range(1, trg_len):
105
- output, hidden, cell = self.decoder(input, hidden, cell)
106
- outputs[t] = output
107
- top1 = output.argmax(1)
108
- input = trg[t] if random.random() < teacher_forcing_ratio else top1
109
-
110
- return outputs
111
-
112
- INPUT_DIM = len(vocab)
113
- OUTPUT_DIM = len(vocab)
114
- ENC_EMB_DIM = 256
115
- DEC_EMB_DIM = 256
116
- HID_DIM = 512
117
- N_LAYERS = 2
118
- ENC_DROPOUT = 0.5
119
- DEC_DROPOUT = 0.5
120
-
121
- enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
122
- dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
123
-
124
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
125
-
126
- model = Seq2Seq(enc, dec, device).to(device)
127
-
128
- # Define optimizer and loss
129
- optimizer = optim.Adam(model.parameters())
130
- criterion = nn.CrossEntropyLoss()
131
-
132
- # Training the model
133
- def train(model, iterator, optimizer, criterion, clip):
134
- model.train()
135
- epoch_loss = 0
136
-
137
- for i, (src, trg) in enumerate(iterator):
138
- src = src.to(device)
139
- trg = trg.to(device)
140
-
141
- optimizer.zero_grad()
142
- output = model(src, trg)
143
- output_dim = output.shape[-1]
144
-
145
- output = output[1:].view(-1, output_dim)
146
- trg = trg[1:].view(-1)
147
-
148
- loss = criterion(output, trg)
149
- loss.backward()
150
-
151
- torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
152
- optimizer.step()
153
- epoch_loss += loss.item()
154
-
155
- return epoch_loss / len(iterator)
156
-
157
- # Dummy training loop for illustration
158
- N_EPOCHS = 10
159
- CLIP = 1
160
-
161
- for epoch in range(N_EPOCHS):
162
- train_loss = train(model, dataloader, optimizer, criterion, CLIP)
163
- print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}')
164
-
165
- # Function for inference
166
- def summarize(text, tokenizer, vocab, model, device, max_len=50):
167
- model.eval()
168
- tokens = [token.lower() for token in tokenizer(text)]
169
- tokens = ["<sos>"] + tokens + ["<eos>"]
170
- src_indexes = [vocab[token] for token in tokens]
171
- src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
172
-
173
- with torch.no_grad():
174
- hidden, cell = model.encoder(src_tensor)
175
-
176
- trg_indexes = [vocab["<sos>"]]
177
-
178
- for i in range(max_len):
179
- trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
180
- with torch.no_grad():
181
- output, hidden, cell = model.decoder(trg_tensor, hidden, cell)
182
- pred_token = output.argmax(1).item()
183
- trg_indexes.append(pred_token)
184
- if pred_token == vocab["<eos>"]:
185
- break
186
-
187
- trg_tokens = [vocab.itos[i] for i in trg_indexes]
188
- return ' '.join(trg_tokens[1:-1])
189
-
190
- # Step 4: Create Streamlit App
191
- st.title("Text Summarization with PyTorch")
192
 
193
  user_input = st.text_area("Enter text to summarize")
194
  if st.button("Summarize"):
195
- summary = summarize(user_input, tokenizer, vocab, model, device)
 
 
196
  st.write(f"Summary: {summary}")
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from transformers import pipeline
3
+ import nltk
4
+ from nltk.tokenize import word_tokenize
5
 
6
+ # Download NLTK data
7
+ nltk.download('punkt')
 
8
 
9
+ # Load summarization pipeline
10
+ summarizer = pipeline("summarization")
 
 
 
 
 
11
 
12
+ # Function to tokenize text
13
def tokenize(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for ``text``.
    """
    tokens = word_tokenize(text)
    return tokens
15
 
16
+ # Function to summarize text
17
def summarize(text):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: The text to summarize.

    Returns:
        The ``summary_text`` of the first pipeline result, or an empty
        string when ``text`` is empty or contains only whitespace.
    """
    # Nothing to summarize: skip the model call entirely. With min_length=30
    # the model would otherwise be forced to invent text for blank input.
    if not text or not text.strip():
        return ""

    results = summarizer(
        text,
        max_length=130,
        min_length=30,
        do_sample=False,
        # Inputs longer than the model's maximum sequence length would
        # otherwise fail inside the model; truncate them instead.
        truncation=True,
    )
    return results[0]['summary_text']
 
 
20
 
21
+ # Streamlit app
22
+ st.title("Text Summarization with Hugging Face Transformers")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  user_input = st.text_area("Enter text to summarize")
25
  if st.button("Summarize"):
26
+ tokens = tokenize(user_input)
27
+ st.write(f"Tokens: {tokens}")
28
+ summary = summarize(user_input)
29
  st.write(f"Summary: {summary}")